bpo-38308: Add optional weighting to statistics.harmonic_mean() (GH-23914)

This commit is contained in:
Raymond Hettinger 2020-12-23 19:52:09 -08:00 committed by GitHub
parent 6dd3da3cf4
commit cc3467a57b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 62 additions and 27 deletions

View File

@ -156,10 +156,11 @@ However, for reading convenience, most of the examples show sorted sequences.
.. versionadded:: 3.8 .. versionadded:: 3.8
.. function:: harmonic_mean(data) .. function:: harmonic_mean(data, weights=None)
Return the harmonic mean of *data*, a sequence or iterable of Return the harmonic mean of *data*, a sequence or iterable of
real-valued numbers. real-valued numbers. If *weights* is omitted or *None*, then
equal weighting is assumed.
The harmonic mean, sometimes called the subcontrary mean, is the The harmonic mean, sometimes called the subcontrary mean, is the
reciprocal of the arithmetic :func:`mean` of the reciprocals of the reciprocal of the arithmetic :func:`mean` of the reciprocals of the
@ -179,17 +180,17 @@ However, for reading convenience, most of the examples show sorted sequences.
>>> harmonic_mean([40, 60]) >>> harmonic_mean([40, 60])
48.0 48.0
Suppose an investor purchases an equal value of shares in each of Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
three companies, with P/E (price/earning) ratios of 2.5, 3 and 10. speeds up to 60 km/hr for the remaining 30 km of the journey. What
What is the average P/E ratio for the investor's portfolio? is the average speed?
.. doctest:: .. doctest::
>>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio. >>> harmonic_mean([40, 60], weights=[5, 30])
3.6 56.0
:exc:`StatisticsError` is raised if *data* is empty, or any element :exc:`StatisticsError` is raised if *data* is empty, any element
is less than zero. is less than zero, or if the weighted sum isn't positive.
The current algorithm has an early-out when it encounters a zero The current algorithm has an early-out when it encounters a zero
in the input. This means that the subsequent inputs are not tested in the input. This means that the subsequent inputs are not tested
@ -197,6 +198,8 @@ However, for reading convenience, most of the examples show sorted sequences.
.. versionadded:: 3.6 .. versionadded:: 3.6
.. versionchanged:: 3.10
Added support for *weights*.
.. function:: median(data) .. function:: median(data)

View File

@ -106,7 +106,7 @@ import random
from fractions import Fraction from fractions import Fraction
from decimal import Decimal from decimal import Decimal
from itertools import groupby from itertools import groupby, repeat
from bisect import bisect_left, bisect_right from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from operator import itemgetter from operator import itemgetter
@ -364,37 +364,37 @@ def geometric_mean(data):
' containing positive numbers') from None ' containing positive numbers') from None
def harmonic_mean(data): def harmonic_mean(data, weights=None):
"""Return the harmonic mean of data. """Return the harmonic mean of data.
The harmonic mean, sometimes called the subcontrary mean, is the The harmonic mean, sometimes called the subcontrary mean, is the
reciprocal of the arithmetic mean of the reciprocals of the data, reciprocal of the arithmetic mean of the reciprocals of the data,
and is often appropriate when averaging quantities which are rates and is often appropriate when averaging quantities which are rates
or ratios, for example speeds. Example: or ratios, for example speeds.
Suppose an investor purchases an equal value of shares in each of Suppose a car travels 40 km/hr for 5 km and then speeds up to
three companies, with P/E (price/earning) ratios of 2.5, 3 and 10. 60 km/hr for another 5 km. What is the average speed?
What is the average P/E ratio for the investor's portfolio?
>>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio. >>> harmonic_mean([40, 60])
3.6 48.0
Using the arithmetic mean would give an average of about 5.167, which Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
is too high. speeds up to 60 km/hr for the remaining 30 km of the journey. What
is the average speed?
>>> harmonic_mean([40, 60], weights=[5, 30])
56.0
If ``data`` is empty, or any element is less than zero, If ``data`` is empty, or any element is less than zero,
``harmonic_mean`` will raise ``StatisticsError``. ``harmonic_mean`` will raise ``StatisticsError``.
""" """
# For a justification for using harmonic mean for P/E ratios, see
# http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
# http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
if iter(data) is data: if iter(data) is data:
data = list(data) data = list(data)
errmsg = 'harmonic mean does not support negative values' errmsg = 'harmonic mean does not support negative values'
n = len(data) n = len(data)
if n < 1: if n < 1:
raise StatisticsError('harmonic_mean requires at least one data point') raise StatisticsError('harmonic_mean requires at least one data point')
elif n == 1: elif n == 1 and weights is None:
x = data[0] x = data[0]
if isinstance(x, (numbers.Real, Decimal)): if isinstance(x, (numbers.Real, Decimal)):
if x < 0: if x < 0:
@ -402,13 +402,23 @@ def harmonic_mean(data):
return x return x
else: else:
raise TypeError('unsupported type') raise TypeError('unsupported type')
if weights is None:
weights = repeat(1, n)
sum_weights = n
else:
if iter(weights) is weights:
weights = list(weights)
if len(weights) != n:
raise StatisticsError('Number of weights does not match data size')
_, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
try: try:
T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg)) data = _fail_neg(data, errmsg)
T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
except ZeroDivisionError: except ZeroDivisionError:
return 0 return 0
assert count == n if total <= 0:
return _convert(n / total, T) raise StatisticsError('Weighted sum must be positive')
return _convert(sum_weights / total, T)
# FIXME: investigate ways to calculate medians without sorting? Quickselect? # FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data): def median(data):

View File

@ -1599,6 +1599,27 @@ class TestHarmonicMean(NumericTestCase, AverageMixin, UnivariateTypeMixin):
actual = self.func(data*2) actual = self.func(data*2)
self.assertApproxEqual(actual, expected) self.assertApproxEqual(actual, expected)
def test_with_weights(self):
self.assertEqual(self.func([40, 60], [5, 30]), 56.0) # common case
self.assertEqual(self.func([40, 60],
weights=[5, 30]), 56.0) # keyword argument
self.assertEqual(self.func(iter([40, 60]),
iter([5, 30])), 56.0) # iterator inputs
self.assertEqual(
self.func([Fraction(10, 3), Fraction(23, 5), Fraction(7, 2)], [5, 2, 10]),
self.func([Fraction(10, 3)] * 5 +
[Fraction(23, 5)] * 2 +
[Fraction(7, 2)] * 10))
self.assertEqual(self.func([10], [7]), 10) # n=1 fast path
with self.assertRaises(TypeError):
self.func([1, 2, 3], [1, (), 3]) # non-numeric weight
with self.assertRaises(statistics.StatisticsError):
self.func([1, 2, 3], [1, 2]) # wrong number of weights
with self.assertRaises(statistics.StatisticsError):
self.func([10], [0]) # no non-zero weights
with self.assertRaises(statistics.StatisticsError):
self.func([10, 20], [0, 0]) # no non-zero weights
class TestMedian(NumericTestCase, AverageMixin): class TestMedian(NumericTestCase, AverageMixin):
# Common tests for median and all median.* functions. # Common tests for median and all median.* functions.

View File

@ -0,0 +1 @@
Add optional *weights* to *statistics.harmonic_mean()*.