bpo-35892: Fix mode() and add multimode() (#12089)
This commit is contained in:
parent
3e936431e2
commit
fc06a192fd
|
@ -37,7 +37,7 @@ Averages and measures of central location
|
|||
These functions calculate an average or typical value from a population
|
||||
or sample.
|
||||
|
||||
======================= =============================================
|
||||
======================= ===============================================================
|
||||
:func:`mean` Arithmetic mean ("average") of data.
|
||||
:func:`fmean` Fast, floating point arithmetic mean.
|
||||
:func:`harmonic_mean` Harmonic mean of data.
|
||||
|
@ -45,8 +45,9 @@ or sample.
|
|||
:func:`median_low` Low median of data.
|
||||
:func:`median_high` High median of data.
|
||||
:func:`median_grouped` Median, or 50th percentile, of grouped data.
|
||||
:func:`mode` Mode (most common value) of discrete data.
|
||||
======================= =============================================
|
||||
:func:`mode` Single mode (most common value) of discrete or nominal data.
|
||||
:func:`multimode` List of modes (most common values) of discrete or nominal data.
|
||||
======================= ===============================================================
|
||||
|
||||
Measures of spread
|
||||
------------------
|
||||
|
@ -287,12 +288,12 @@ However, for reading convenience, most of the examples show sorted sequences.
|
|||
|
||||
.. function:: mode(data)
|
||||
|
||||
Return the most common data point from discrete or nominal *data*. The mode
|
||||
(when it exists) is the most typical value, and is a robust measure of
|
||||
central location.
|
||||
Return the single most common data point from discrete or nominal *data*.
|
||||
The mode (when it exists) is the most typical value and serves as a
|
||||
measure of central location.
|
||||
|
||||
If *data* is empty, or if there is not exactly one most common value,
|
||||
:exc:`StatisticsError` is raised.
|
||||
If there are multiple modes, returns the first one encountered in the *data*.
|
||||
If *data* is empty, :exc:`StatisticsError` is raised.
|
||||
|
||||
``mode`` assumes discrete data, and returns a single value. This is the
|
||||
standard treatment of the mode as commonly taught in schools:
|
||||
|
@ -310,6 +311,27 @@ However, for reading convenience, most of the examples show sorted sequences.
|
|||
>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
|
||||
'red'
|
||||
|
||||
.. versionchanged:: 3.8
|
||||
Now handles multimodal datasets by returning the first mode encountered.
|
||||
Formerly, it raised :exc:`StatisticsError` when more than one mode was
|
||||
found.
|
||||
|
||||
|
||||
.. function:: multimode(data)
|
||||
|
||||
Return a list of the most frequently occurring values in the order they
|
||||
were first encountered in the *data*. Will return more than one result if
|
||||
there are multiple modes or an empty list if the *data* is empty:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> multimode('aabbbbccddddeeffffgg')
|
||||
['b', 'd', 'f']
|
||||
>>> multimode('')
|
||||
[]
|
||||
|
||||
.. versionadded:: 3.8
|
||||
|
||||
|
||||
.. function:: pstdev(data, mu=None)
|
||||
|
||||
|
|
|
@ -282,6 +282,9 @@ Added :func:`statistics.fmean` as a faster, floating point variant of
|
|||
:func:`statistics.mean()`. (Contributed by Raymond Hettinger and
|
||||
Steven D'Aprano in :issue:`35904`.)
|
||||
|
||||
Added :func:`statistics.multimode` that returns a list of the most
|
||||
common values. (Contributed by Raymond Hettinger in :issue:`35892`.)
|
||||
|
||||
Added :class:`statistics.NormalDist`, a tool for creating
|
||||
and manipulating normal distributions of a random variable.
|
||||
(Contributed by Raymond Hettinger in :issue:`36018`.)
|
||||
|
@ -591,6 +594,11 @@ Changes in the Python API
|
|||
* The function :func:`platform.popen` has been removed, it was deprecated since
|
||||
Python 3.3: use :func:`os.popen` instead.
|
||||
|
||||
* The :func:`statistics.mode` function no longer raises an exception
|
||||
when given multimodal data. Instead, it returns the first mode
|
||||
encountered in the input data. (Contributed by Raymond Hettinger
|
||||
in :issue:`35892`.)
|
||||
|
||||
* The :meth:`~tkinter.ttk.Treeview.selection` method of the
|
||||
:class:`tkinter.ttk.Treeview` class no longer takes arguments. Using it with
|
||||
arguments for changing the selection was deprecated in Python 3.6. Use
|
||||
|
|
|
@ -17,6 +17,7 @@ median_low Low median of data.
|
|||
median_high High median of data.
|
||||
median_grouped Median, or 50th percentile, of grouped data.
|
||||
mode Mode (most common value) of data.
|
||||
multimode List of modes (most common values) of data.
|
||||
================== =============================================
|
||||
|
||||
Calculate the arithmetic mean ("the average") of data:
|
||||
|
@ -79,10 +80,9 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
|
|||
__all__ = [ 'StatisticsError', 'NormalDist',
|
||||
'pstdev', 'pvariance', 'stdev', 'variance',
|
||||
'median', 'median_low', 'median_high', 'median_grouped',
|
||||
'mean', 'mode', 'harmonic_mean', 'fmean',
|
||||
'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
|
||||
]
|
||||
|
||||
import collections
|
||||
import math
|
||||
import numbers
|
||||
import random
|
||||
|
@ -92,8 +92,8 @@ from decimal import Decimal
|
|||
from itertools import groupby
|
||||
from bisect import bisect_left, bisect_right
|
||||
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
|
||||
|
||||
|
||||
from operator import itemgetter
|
||||
from collections import Counter
|
||||
|
||||
# === Exceptions ===
|
||||
|
||||
|
@ -249,20 +249,6 @@ def _convert(value, T):
|
|||
raise
|
||||
|
||||
|
||||
def _counts(data):
|
||||
# Generate a table of sorted (value, frequency) pairs.
|
||||
table = collections.Counter(iter(data)).most_common()
|
||||
if not table:
|
||||
return table
|
||||
# Extract the values with the highest frequency.
|
||||
maxfreq = table[0][1]
|
||||
for i in range(1, len(table)):
|
||||
if table[i][1] != maxfreq:
|
||||
table = table[:i]
|
||||
break
|
||||
return table
|
||||
|
||||
|
||||
def _find_lteq(a, x):
|
||||
'Locate the leftmost value exactly equal to x'
|
||||
i = bisect_left(a, x)
|
||||
|
@ -334,9 +320,9 @@ def fmean(data):
|
|||
nonlocal n
|
||||
n += 1
|
||||
return x
|
||||
total = math.fsum(map(count, data))
|
||||
total = fsum(map(count, data))
|
||||
else:
|
||||
total = math.fsum(data)
|
||||
total = fsum(data)
|
||||
try:
|
||||
return total / n
|
||||
except ZeroDivisionError:
|
||||
|
@ -523,19 +509,38 @@ def mode(data):
|
|||
>>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
|
||||
'red'
|
||||
|
||||
If there is not exactly one most common value, ``mode`` will raise
|
||||
StatisticsError.
|
||||
If there are multiple modes, return the first one encountered.
|
||||
|
||||
>>> mode(['red', 'red', 'green', 'blue', 'blue'])
|
||||
'red'
|
||||
|
||||
If *data* is empty, ``mode`` raises StatisticsError.
|
||||
|
||||
"""
|
||||
# Generate a table of sorted (value, frequency) pairs.
|
||||
table = _counts(data)
|
||||
if len(table) == 1:
|
||||
return table[0][0]
|
||||
elif table:
|
||||
raise StatisticsError(
|
||||
'no unique mode; found %d equally common values' % len(table)
|
||||
)
|
||||
else:
|
||||
raise StatisticsError('no mode for empty data')
|
||||
data = iter(data)
|
||||
try:
|
||||
return Counter(data).most_common(1)[0][0]
|
||||
except IndexError:
|
||||
raise StatisticsError('no mode for empty data') from None
|
||||
|
||||
|
||||
def multimode(data):
    """Return a list of the most frequently occurring values.

    The modes are listed in the order they were first encountered in
    *data*.  Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []

    """
    counts = Counter(iter(data))
    if not counts:
        return []
    # A Counter remembers insertion order, so filtering on the maximum
    # count yields the modes in first-encountered order without having to
    # sort the whole frequency table.
    maxcount = max(counts.values())
    return [value for value, count in counts.items() if count == maxcount]
|
||||
|
||||
|
||||
# === Measures of spread ===
|
||||
|
@ -836,6 +841,7 @@ if __name__ == '__main__':
|
|||
from math import isclose
|
||||
from operator import add, sub, mul, truediv
|
||||
from itertools import repeat
|
||||
import doctest
|
||||
|
||||
g1 = NormalDist(10, 20)
|
||||
g2 = NormalDist(-5, 25)
|
||||
|
@ -893,3 +899,5 @@ if __name__ == '__main__':
|
|||
S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n),
|
||||
Y.samples(n))])
|
||||
assert_close(X - Y, S)
|
||||
|
||||
print(doctest.testmod())
|
||||
|
|
|
@ -1769,7 +1769,7 @@ class TestMode(NumericTestCase, AverageMixin, UnivariateTypeMixin):
|
|||
def test_range_data(self):
|
||||
# Override test from UnivariateCommonMixin.
|
||||
data = range(20, 50, 3)
|
||||
self.assertRaises(statistics.StatisticsError, self.func, data)
|
||||
self.assertEqual(self.func(data), 20)
|
||||
|
||||
def test_nominal_data(self):
|
||||
# Test mode with nominal data.
|
||||
|
@ -1790,13 +1790,14 @@ class TestMode(NumericTestCase, AverageMixin, UnivariateTypeMixin):
|
|||
# Test mode with bimodal data.
|
||||
data = [1, 1, 2, 2, 2, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 9, 9]
|
||||
assert data.count(2) == data.count(6) == 4
|
||||
# Check for an exception.
|
||||
self.assertRaises(statistics.StatisticsError, self.func, data)
|
||||
# mode() should return 2, the first encountered mode
|
||||
self.assertEqual(self.func(data), 2)
|
||||
|
||||
def test_unique_data_failure(self):
|
||||
# Test mode exception when data points are all unique.
|
||||
def test_unique_data(self):
|
||||
# Test mode when data points are all unique.
|
||||
data = list(range(10))
|
||||
self.assertRaises(statistics.StatisticsError, self.func, data)
|
||||
# mode() should return 0, the first encountered mode
|
||||
self.assertEqual(self.func(data), 0)
|
||||
|
||||
def test_none_data(self):
|
||||
# Test that mode raises TypeError if given None as data.
|
||||
|
@ -1809,8 +1810,18 @@ class TestMode(NumericTestCase, AverageMixin, UnivariateTypeMixin):
|
|||
# Test that a Counter is treated like any other iterable.
|
||||
data = collections.Counter([1, 1, 1, 2])
|
||||
# Since the keys of the counter are treated as data points, not the
|
||||
# counts, this should raise.
|
||||
self.assertRaises(statistics.StatisticsError, self.func, data)
|
||||
# counts, this should return the first mode encountered, 1
|
||||
self.assertEqual(self.func(data), 1)
|
||||
|
||||
|
||||
class TestMultiMode(unittest.TestCase):
    """Tests for statistics.multimode()."""

    def test_basics(self):
        # Single mode, several tied modes, and empty input.
        multimode = statistics.multimode
        self.assertEqual(multimode('aabbbbbbbbcc'), ['b'])
        self.assertEqual(multimode('aabbbbccddddeeffffgg'), ['b', 'd', 'f'])
        self.assertEqual(multimode(''), [])

    def test_first_encountered_order(self):
        # Tied modes must come back in the order they first appear in the
        # data, which differs from sorted order for these inputs.
        multimode = statistics.multimode
        self.assertEqual(multimode('ddddbbbbffff'), ['d', 'b', 'f'])
        self.assertEqual(multimode([3, 1, 3, 1, 2]), [3, 1])

    def test_iterator_input(self):
        # One-shot iterators are accepted as well as sequences.
        multimode = statistics.multimode
        self.assertEqual(multimode(iter([1, 2, 2, 3, 3])), [2, 3])
|
||||
|
||||
|
||||
class TestFMean(unittest.TestCase):
|
||||
|
||||
|
|
Loading…
Reference in New Issue