bpo-36546: Add statistics.quantiles() (#12710)
This commit is contained in:
parent
d437012cdd
commit
9013ccf6d8
|
@ -48,6 +48,7 @@ or sample.
|
||||||
:func:`median_grouped` Median, or 50th percentile, of grouped data.
|
:func:`median_grouped` Median, or 50th percentile, of grouped data.
|
||||||
:func:`mode` Single mode (most common value) of discrete or nominal data.
|
:func:`mode` Single mode (most common value) of discrete or nominal data.
|
||||||
:func:`multimode` List of modes (most common values) of discrete or nomimal data.
|
:func:`multimode` List of modes (most common values) of discrete or nomimal data.
|
||||||
|
:func:`quantiles` Divide data into intervals with equal probability.
|
||||||
======================= ===============================================================
|
======================= ===============================================================
|
||||||
|
|
||||||
Measures of spread
|
Measures of spread
|
||||||
|
@ -499,6 +500,53 @@ However, for reading convenience, most of the examples show sorted sequences.
|
||||||
:func:`pvariance` function as the *mu* parameter to get the variance of a
|
:func:`pvariance` function as the *mu* parameter to get the variance of a
|
||||||
sample.
|
sample.
|
||||||
|
|
||||||
|
.. function:: quantiles(dist, *, n=4, method='exclusive')
|
||||||
|
|
||||||
|
Divide *dist* into *n* continuous intervals with equal probability.
|
||||||
|
Returns a list of ``n - 1`` cut points separating the intervals.
|
||||||
|
|
||||||
|
Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles. Set
|
||||||
|
*n* to 100 for percentiles which gives the 99 cuts points that separate
|
||||||
|
*dist* in to 100 equal sized groups. Raises :exc:`StatisticsError` if *n*
|
||||||
|
is not least 1.
|
||||||
|
|
||||||
|
The *dist* can be any iterable containing sample data or it can be an
|
||||||
|
instance of a class that defines an :meth:`~inv_cdf` method.
|
||||||
|
Raises :exc:`StatisticsError` if there are not at least two data points.
|
||||||
|
|
||||||
|
For sample data, the cut points are linearly interpolated from the
|
||||||
|
two nearest data points. For example, if a cut point falls one-third
|
||||||
|
of the distance between two sample values, ``100`` and ``112``, the
|
||||||
|
cut-point will evaluate to ``104``. Other selection methods may be
|
||||||
|
offered in the future (for example choose ``100`` as the nearest
|
||||||
|
value or compute ``106`` as the midpoint). This might matter if
|
||||||
|
there are too few samples for a given number of cut points.
|
||||||
|
|
||||||
|
If *method* is set to *inclusive*, *dist* is treated as population data.
|
||||||
|
The minimum value is treated as the 0th percentile and the maximum
|
||||||
|
value is treated as the 100th percentile. If *dist* is an instance of
|
||||||
|
a class that defines an :meth:`~inv_cdf` method, setting *method*
|
||||||
|
has no effect.
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
|
# Decile cut points for empirically sampled data
|
||||||
|
>>> data = [105, 129, 87, 86, 111, 111, 89, 81, 108, 92, 110,
|
||||||
|
... 100, 75, 105, 103, 109, 76, 119, 99, 91, 103, 129,
|
||||||
|
... 106, 101, 84, 111, 74, 87, 86, 103, 103, 106, 86,
|
||||||
|
... 111, 75, 87, 102, 121, 111, 88, 89, 101, 106, 95,
|
||||||
|
... 103, 107, 101, 81, 109, 104]
|
||||||
|
>>> [round(q, 1) for q in quantiles(data, n=10)]
|
||||||
|
[81.0, 86.2, 89.0, 99.4, 102.5, 103.6, 106.0, 109.8, 111.0]
|
||||||
|
|
||||||
|
>>> # Quartile cut points for the standard normal distibution
|
||||||
|
>>> Z = NormalDist()
|
||||||
|
>>> [round(q, 4) for q in quantiles(Z, n=4)]
|
||||||
|
[-0.6745, 0.0, 0.6745]
|
||||||
|
|
||||||
|
.. versionadded:: 3.8
|
||||||
|
|
||||||
|
|
||||||
Exceptions
|
Exceptions
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
@ -606,7 +654,7 @@ of applications in statistics.
|
||||||
<http://www.iceaaonline.com/ready/wp-content/uploads/2014/06/MM-9-Presentation-Meet-the-Overlapping-Coefficient-A-Measure-for-Elevator-Speeches.pdf>`_
|
<http://www.iceaaonline.com/ready/wp-content/uploads/2014/06/MM-9-Presentation-Meet-the-Overlapping-Coefficient-A-Measure-for-Elevator-Speeches.pdf>`_
|
||||||
between two normal distributions, giving a measure of agreement.
|
between two normal distributions, giving a measure of agreement.
|
||||||
Returns a value between 0.0 and 1.0 giving `the overlapping area for
|
Returns a value between 0.0 and 1.0 giving `the overlapping area for
|
||||||
two probability density functions
|
the two probability density functions
|
||||||
<https://www.rasch.org/rmt/rmt101r.htm>`_.
|
<https://www.rasch.org/rmt/rmt101r.htm>`_.
|
||||||
|
|
||||||
Instances of :class:`NormalDist` support addition, subtraction,
|
Instances of :class:`NormalDist` support addition, subtraction,
|
||||||
|
@ -649,8 +697,8 @@ of applications in statistics.
|
||||||
For example, given `historical data for SAT exams
|
For example, given `historical data for SAT exams
|
||||||
<https://blog.prepscholar.com/sat-standard-deviation>`_ showing that scores
|
<https://blog.prepscholar.com/sat-standard-deviation>`_ showing that scores
|
||||||
are normally distributed with a mean of 1060 and a standard deviation of 192,
|
are normally distributed with a mean of 1060 and a standard deviation of 192,
|
||||||
determine the percentage of students with scores between 1100 and 1200, after
|
determine the percentage of students with test scores between 1100 and
|
||||||
rounding to the nearest whole number:
|
1200, after rounding to the nearest whole number:
|
||||||
|
|
||||||
.. doctest::
|
.. doctest::
|
||||||
|
|
||||||
|
|
|
@ -337,6 +337,10 @@ Added :func:`statistics.geometric_mean()`
|
||||||
Added :func:`statistics.multimode` that returns a list of the most
|
Added :func:`statistics.multimode` that returns a list of the most
|
||||||
common values. (Contributed by Raymond Hettinger in :issue:`35892`.)
|
common values. (Contributed by Raymond Hettinger in :issue:`35892`.)
|
||||||
|
|
||||||
|
Added :func:`statistics.quantiles` that divides data or a distribution
|
||||||
|
in to equiprobable intervals (e.g. quartiles, deciles, or percentiles).
|
||||||
|
(Contributed by Raymond Hettinger in :issue:`36546`.)
|
||||||
|
|
||||||
Added :class:`statistics.NormalDist`, a tool for creating
|
Added :class:`statistics.NormalDist`, a tool for creating
|
||||||
and manipulating normal distributions of a random variable.
|
and manipulating normal distributions of a random variable.
|
||||||
(Contributed by Raymond Hettinger in :issue:`36018`.)
|
(Contributed by Raymond Hettinger in :issue:`36018`.)
|
||||||
|
|
|
@ -7,9 +7,9 @@ averages, variance, and standard deviation.
|
||||||
Calculating averages
|
Calculating averages
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
================== =============================================
|
================== ==================================================
|
||||||
Function Description
|
Function Description
|
||||||
================== =============================================
|
================== ==================================================
|
||||||
mean Arithmetic mean (average) of data.
|
mean Arithmetic mean (average) of data.
|
||||||
geometric_mean Geometric mean of data.
|
geometric_mean Geometric mean of data.
|
||||||
harmonic_mean Harmonic mean of data.
|
harmonic_mean Harmonic mean of data.
|
||||||
|
@ -19,7 +19,8 @@ median_high High median of data.
|
||||||
median_grouped Median, or 50th percentile, of grouped data.
|
median_grouped Median, or 50th percentile, of grouped data.
|
||||||
mode Mode (most common value) of data.
|
mode Mode (most common value) of data.
|
||||||
multimode List of modes (most common values of data).
|
multimode List of modes (most common values of data).
|
||||||
================== =============================================
|
quantiles Divide data into intervals with equal probability.
|
||||||
|
================== ==================================================
|
||||||
|
|
||||||
Calculate the arithmetic mean ("the average") of data:
|
Calculate the arithmetic mean ("the average") of data:
|
||||||
|
|
||||||
|
@ -78,7 +79,7 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__all__ = [ 'StatisticsError', 'NormalDist',
|
__all__ = [ 'StatisticsError', 'NormalDist', 'quantiles',
|
||||||
'pstdev', 'pvariance', 'stdev', 'variance',
|
'pstdev', 'pvariance', 'stdev', 'variance',
|
||||||
'median', 'median_low', 'median_high', 'median_grouped',
|
'median', 'median_low', 'median_high', 'median_grouped',
|
||||||
'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
|
'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
|
||||||
|
@ -562,6 +563,54 @@ def multimode(data):
|
||||||
maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
|
maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
|
||||||
return list(map(itemgetter(0), mode_items))
|
return list(map(itemgetter(0), mode_items))
|
||||||
|
|
||||||
|
def quantiles(dist, *, n=4, method='exclusive'):
|
||||||
|
'''Divide *dist* into *n* continuous intervals with equal probability.
|
||||||
|
|
||||||
|
Returns a list of (n - 1) cut points separating the intervals.
|
||||||
|
|
||||||
|
Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
|
||||||
|
Set *n* to 100 for percentiles which gives the 99 cuts points that
|
||||||
|
separate *dist* in to 100 equal sized groups.
|
||||||
|
|
||||||
|
The *dist* can be any iterable containing sample data or it can be
|
||||||
|
an instance of a class that defines an inv_cdf() method. For sample
|
||||||
|
data, the cut points are linearly interpolated between data points.
|
||||||
|
|
||||||
|
If *method* is set to *inclusive*, *dist* is treated as population
|
||||||
|
data. The minimum value is treated as the 0th percentile and the
|
||||||
|
maximum value is treated as the 100th percentile.
|
||||||
|
'''
|
||||||
|
# Possible future API extensions:
|
||||||
|
# quantiles(data, already_sorted=True)
|
||||||
|
# quantiles(data, cut_points=[0.02, 0.25, 0.50, 0.75, 0.98])
|
||||||
|
if n < 1:
|
||||||
|
raise StatisticsError('n must be at least 1')
|
||||||
|
if hasattr(dist, 'inv_cdf'):
|
||||||
|
return [dist.inv_cdf(i / n) for i in range(1, n)]
|
||||||
|
data = sorted(dist)
|
||||||
|
ld = len(data)
|
||||||
|
if ld < 2:
|
||||||
|
raise StatisticsError('must have at least two data points')
|
||||||
|
if method == 'inclusive':
|
||||||
|
m = ld - 1
|
||||||
|
result = []
|
||||||
|
for i in range(1, n):
|
||||||
|
j = i * m // n
|
||||||
|
delta = i*m - j*n
|
||||||
|
interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n
|
||||||
|
result.append(interpolated)
|
||||||
|
return result
|
||||||
|
if method == 'exclusive':
|
||||||
|
m = ld + 1
|
||||||
|
result = []
|
||||||
|
for i in range(1, n):
|
||||||
|
j = i * m // n # rescale i to m/n
|
||||||
|
j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1
|
||||||
|
delta = i*m - j*n # exact integer math
|
||||||
|
interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n
|
||||||
|
result.append(interpolated)
|
||||||
|
return result
|
||||||
|
raise ValueError(f'Unknown method: {method!r}')
|
||||||
|
|
||||||
# === Measures of spread ===
|
# === Measures of spread ===
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ approx_equal function.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import bisect
|
||||||
import collections
|
import collections
|
||||||
import collections.abc
|
import collections.abc
|
||||||
import copy
|
import copy
|
||||||
|
@ -2038,6 +2039,7 @@ class TestStdev(VarianceStdevMixin, NumericTestCase):
|
||||||
expected = math.sqrt(statistics.variance(data))
|
expected = math.sqrt(statistics.variance(data))
|
||||||
self.assertEqual(self.func(data), expected)
|
self.assertEqual(self.func(data), expected)
|
||||||
|
|
||||||
|
|
||||||
class TestGeometricMean(unittest.TestCase):
|
class TestGeometricMean(unittest.TestCase):
|
||||||
|
|
||||||
def test_basics(self):
|
def test_basics(self):
|
||||||
|
@ -2126,6 +2128,146 @@ class TestGeometricMean(unittest.TestCase):
|
||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
geometric_mean([Inf, -Inf])
|
geometric_mean([Inf, -Inf])
|
||||||
|
|
||||||
|
|
||||||
|
class TestQuantiles(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_specific_cases(self):
|
||||||
|
# Match results computed by hand and cross-checked
|
||||||
|
# against the PERCENTILE.EXC function in MS Excel.
|
||||||
|
quantiles = statistics.quantiles
|
||||||
|
data = [120, 200, 250, 320, 350]
|
||||||
|
random.shuffle(data)
|
||||||
|
for n, expected in [
|
||||||
|
(1, []),
|
||||||
|
(2, [250.0]),
|
||||||
|
(3, [200.0, 320.0]),
|
||||||
|
(4, [160.0, 250.0, 335.0]),
|
||||||
|
(5, [136.0, 220.0, 292.0, 344.0]),
|
||||||
|
(6, [120.0, 200.0, 250.0, 320.0, 350.0]),
|
||||||
|
(8, [100.0, 160.0, 212.5, 250.0, 302.5, 335.0, 357.5]),
|
||||||
|
(10, [88.0, 136.0, 184.0, 220.0, 250.0, 292.0, 326.0, 344.0, 362.0]),
|
||||||
|
(12, [80.0, 120.0, 160.0, 200.0, 225.0, 250.0, 285.0, 320.0, 335.0,
|
||||||
|
350.0, 365.0]),
|
||||||
|
(15, [72.0, 104.0, 136.0, 168.0, 200.0, 220.0, 240.0, 264.0, 292.0,
|
||||||
|
320.0, 332.0, 344.0, 356.0, 368.0]),
|
||||||
|
]:
|
||||||
|
self.assertEqual(expected, quantiles(data, n=n))
|
||||||
|
self.assertEqual(len(quantiles(data, n=n)), n - 1)
|
||||||
|
self.assertEqual(list(map(float, expected)),
|
||||||
|
quantiles(map(Decimal, data), n=n))
|
||||||
|
self.assertEqual(list(map(Decimal, expected)),
|
||||||
|
quantiles(map(Decimal, data), n=n))
|
||||||
|
self.assertEqual(list(map(Fraction, expected)),
|
||||||
|
quantiles(map(Fraction, data), n=n))
|
||||||
|
# Invariant under tranlation and scaling
|
||||||
|
def f(x):
|
||||||
|
return 3.5 * x - 1234.675
|
||||||
|
exp = list(map(f, expected))
|
||||||
|
act = quantiles(map(f, data), n=n)
|
||||||
|
self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act)))
|
||||||
|
# Quartiles of a standard normal distribution
|
||||||
|
for n, expected in [
|
||||||
|
(1, []),
|
||||||
|
(2, [0.0]),
|
||||||
|
(3, [-0.4307, 0.4307]),
|
||||||
|
(4 ,[-0.6745, 0.0, 0.6745]),
|
||||||
|
]:
|
||||||
|
actual = quantiles(statistics.NormalDist(), n=n)
|
||||||
|
self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
|
||||||
|
for e, a in zip(expected, actual)))
|
||||||
|
|
||||||
|
def test_specific_cases_inclusive(self):
|
||||||
|
# Match results computed by hand and cross-checked
|
||||||
|
# against the PERCENTILE.INC function in MS Excel
|
||||||
|
# and against the quaatile() function in SciPy.
|
||||||
|
quantiles = statistics.quantiles
|
||||||
|
data = [100, 200, 400, 800]
|
||||||
|
random.shuffle(data)
|
||||||
|
for n, expected in [
|
||||||
|
(1, []),
|
||||||
|
(2, [300.0]),
|
||||||
|
(3, [200.0, 400.0]),
|
||||||
|
(4, [175.0, 300.0, 500.0]),
|
||||||
|
(5, [160.0, 240.0, 360.0, 560.0]),
|
||||||
|
(6, [150.0, 200.0, 300.0, 400.0, 600.0]),
|
||||||
|
(8, [137.5, 175, 225.0, 300.0, 375.0, 500.0,650.0]),
|
||||||
|
(10, [130.0, 160.0, 190.0, 240.0, 300.0, 360.0, 440.0, 560.0, 680.0]),
|
||||||
|
(12, [125.0, 150.0, 175.0, 200.0, 250.0, 300.0, 350.0, 400.0,
|
||||||
|
500.0, 600.0, 700.0]),
|
||||||
|
(15, [120.0, 140.0, 160.0, 180.0, 200.0, 240.0, 280.0, 320.0, 360.0,
|
||||||
|
400.0, 480.0, 560.0, 640.0, 720.0]),
|
||||||
|
]:
|
||||||
|
self.assertEqual(expected, quantiles(data, n=n, method="inclusive"))
|
||||||
|
self.assertEqual(len(quantiles(data, n=n, method="inclusive")), n - 1)
|
||||||
|
self.assertEqual(list(map(float, expected)),
|
||||||
|
quantiles(map(Decimal, data), n=n, method="inclusive"))
|
||||||
|
self.assertEqual(list(map(Decimal, expected)),
|
||||||
|
quantiles(map(Decimal, data), n=n, method="inclusive"))
|
||||||
|
self.assertEqual(list(map(Fraction, expected)),
|
||||||
|
quantiles(map(Fraction, data), n=n, method="inclusive"))
|
||||||
|
# Invariant under tranlation and scaling
|
||||||
|
def f(x):
|
||||||
|
return 3.5 * x - 1234.675
|
||||||
|
exp = list(map(f, expected))
|
||||||
|
act = quantiles(map(f, data), n=n, method="inclusive")
|
||||||
|
self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act)))
|
||||||
|
# Quartiles of a standard normal distribution
|
||||||
|
for n, expected in [
|
||||||
|
(1, []),
|
||||||
|
(2, [0.0]),
|
||||||
|
(3, [-0.4307, 0.4307]),
|
||||||
|
(4 ,[-0.6745, 0.0, 0.6745]),
|
||||||
|
]:
|
||||||
|
actual = quantiles(statistics.NormalDist(), n=n, method="inclusive")
|
||||||
|
self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001)
|
||||||
|
for e, a in zip(expected, actual)))
|
||||||
|
|
||||||
|
def test_equal_sized_groups(self):
|
||||||
|
quantiles = statistics.quantiles
|
||||||
|
total = 10_000
|
||||||
|
data = [random.expovariate(0.2) for i in range(total)]
|
||||||
|
while len(set(data)) != total:
|
||||||
|
data.append(random.expovariate(0.2))
|
||||||
|
data.sort()
|
||||||
|
|
||||||
|
# Cases where the group size exactly divides the total
|
||||||
|
for n in (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000):
|
||||||
|
group_size = total // n
|
||||||
|
self.assertEqual(
|
||||||
|
[bisect.bisect(data, q) for q in quantiles(data, n=n)],
|
||||||
|
list(range(group_size, total, group_size)))
|
||||||
|
|
||||||
|
# When the group sizes can't be exactly equal, they should
|
||||||
|
# differ by no more than one
|
||||||
|
for n in (13, 19, 59, 109, 211, 571, 1019, 1907, 5261, 9769):
|
||||||
|
group_sizes = {total // n, total // n + 1}
|
||||||
|
pos = [bisect.bisect(data, q) for q in quantiles(data, n=n)]
|
||||||
|
sizes = {q - p for p, q in zip(pos, pos[1:])}
|
||||||
|
self.assertTrue(sizes <= group_sizes)
|
||||||
|
|
||||||
|
def test_error_cases(self):
|
||||||
|
quantiles = statistics.quantiles
|
||||||
|
StatisticsError = statistics.StatisticsError
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
quantiles() # Missing arguments
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
quantiles([10, 20, 30], 13, n=4) # Too many arguments
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
quantiles([10, 20, 30], 4) # n is a positional argument
|
||||||
|
with self.assertRaises(StatisticsError):
|
||||||
|
quantiles([10, 20, 30], n=0) # n is zero
|
||||||
|
with self.assertRaises(StatisticsError):
|
||||||
|
quantiles([10, 20, 30], n=-1) # n is negative
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
quantiles([10, 20, 30], n=1.5) # n is not an integer
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
quantiles([10, 20, 30], method='X') # method is unknown
|
||||||
|
with self.assertRaises(StatisticsError):
|
||||||
|
quantiles([10], n=4) # not enough data points
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
quantiles([10, None, 30], n=4) # data is non-numeric
|
||||||
|
|
||||||
|
|
||||||
class TestNormalDist(unittest.TestCase):
|
class TestNormalDist(unittest.TestCase):
|
||||||
|
|
||||||
# General note on precision: The pdf(), cdf(), and overlap() methods
|
# General note on precision: The pdf(), cdf(), and overlap() methods
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Add statistics.quantiles()
|
Loading…
Reference in New Issue