From cc3467a57b61b0e7ef254b36790a1c44b13f2228 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 23 Dec 2020 19:52:09 -0800 Subject: [PATCH] bpo-38308: Add optional weighting to statistics.harmonic_mean() (GH-23914) --- Doc/library/statistics.rst | 21 +++++---- Lib/statistics.py | 46 +++++++++++-------- Lib/test/test_statistics.py | 21 +++++++++ .../2020-12-23-15-16-12.bpo-38308.lB4Sv0.rst | 1 + 4 files changed, 62 insertions(+), 27 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2020-12-23-15-16-12.bpo-38308.lB4Sv0.rst diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 38a499ab37e..6467704006d 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -156,10 +156,11 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionadded:: 3.8 -.. function:: harmonic_mean(data) +.. function:: harmonic_mean(data, weights=None) Return the harmonic mean of *data*, a sequence or iterable of - real-valued numbers. + real-valued numbers. If *weights* is omitted or *None*, then + equal weighting is assumed. The harmonic mean, sometimes called the subcontrary mean, is the reciprocal of the arithmetic :func:`mean` of the reciprocals of the @@ -179,17 +180,17 @@ However, for reading convenience, most of the examples show sorted sequences. >>> harmonic_mean([40, 60]) 48.0 - Suppose an investor purchases an equal value of shares in each of - three companies, with P/E (price/earning) ratios of 2.5, 3 and 10. - What is the average P/E ratio for the investor's portfolio? + Suppose a car travels 40 km/hr for 5 km, and when traffic clears, + speeds-up to 60 km/hr for the remaining 30 km of the journey. What + is the average speed? .. doctest:: - >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio. - 3.6 + >>> harmonic_mean([40, 60], weights=[5, 30]) + 56.0 - :exc:`StatisticsError` is raised if *data* is empty, or any element - is less than zero. 
+ :exc:`StatisticsError` is raised if *data* is empty, any element + is less than zero, or if the weighted sum isn't positive. The current algorithm has an early-out when it encounters a zero in the input. This means that the subsequent inputs are not tested @@ -197,6 +198,8 @@ However, for reading convenience, most of the examples show sorted sequences. .. versionadded:: 3.6 + .. versionchanged:: 3.10 + Added support for *weights*. .. function:: median(data) diff --git a/Lib/statistics.py b/Lib/statistics.py index f9d3802ec5f..4b054b96114 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -106,7 +106,7 @@ import random from fractions import Fraction from decimal import Decimal -from itertools import groupby +from itertools import groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum from operator import itemgetter @@ -364,37 +364,37 @@ def geometric_mean(data): ' containing positive numbers') from None -def harmonic_mean(data): +def harmonic_mean(data, weights=None): """Return the harmonic mean of data. The harmonic mean, sometimes called the subcontrary mean, is the reciprocal of the arithmetic mean of the reciprocals of the data, and is often appropriate when averaging quantities which are rates - or ratios, for example speeds. Example: + or ratios, for example speeds. - Suppose an investor purchases an equal value of shares in each of - three companies, with P/E (price/earning) ratios of 2.5, 3 and 10. - What is the average P/E ratio for the investor's portfolio? + Suppose a car travels 40 km/hr for 5 km and then speeds-up to + 60 km/hr for another 5 km. What is the average speed? - >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio. - 3.6 + >>> harmonic_mean([40, 60]) + 48.0 - Using the arithmetic mean would give an average of about 5.167, which - is too high. 
+ Suppose a car travels 40 km/hr for 5 km, and when traffic clears, + speeds-up to 60 km/hr for the remaining 30 km of the journey. What + is the average speed? + + >>> harmonic_mean([40, 60], weights=[5, 30]) + 56.0 If ``data`` is empty, or any element is less than zero, ``harmonic_mean`` will raise ``StatisticsError``. """ - # For a justification for using harmonic mean for P/E ratios, see - # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/ - # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087 if iter(data) is data: data = list(data) errmsg = 'harmonic mean does not support negative values' n = len(data) if n < 1: raise StatisticsError('harmonic_mean requires at least one data point') - elif n == 1: + elif n == 1 and weights is None: x = data[0] if isinstance(x, (numbers.Real, Decimal)): if x < 0: @@ -402,13 +402,23 @@ def harmonic_mean(data): return x else: raise TypeError('unsupported type') + if weights is None: + weights = repeat(1, n) + sum_weights = n + else: + if iter(weights) is weights: + weights = list(weights) + if len(weights) != n: + raise StatisticsError('Number of weights does not match data size') + _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg)) try: - T, total, count = _sum(1 / x for x in _fail_neg(data, errmsg)) + data = _fail_neg(data, errmsg) + T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data)) except ZeroDivisionError: return 0 - assert count == n - return _convert(n / total, T) - + if total <= 0: + raise StatisticsError('Weighted sum must be positive') + return _convert(sum_weights / total, T) # FIXME: investigate ways to calculate medians without sorting? Quickselect? 
def median(data): diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 997110732a1..4b8686b6818 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1599,6 +1599,27 @@ class TestHarmonicMean(NumericTestCase, AverageMixin, UnivariateTypeMixin): actual = self.func(data*2) self.assertApproxEqual(actual, expected) + def test_with_weights(self): + self.assertEqual(self.func([40, 60], [5, 30]), 56.0) # common case + self.assertEqual(self.func([40, 60], + weights=[5, 30]), 56.0) # keyword argument + self.assertEqual(self.func(iter([40, 60]), + iter([5, 30])), 56.0) # iterator inputs + self.assertEqual( + self.func([Fraction(10, 3), Fraction(23, 5), Fraction(7, 2)], [5, 2, 10]), + self.func([Fraction(10, 3)] * 5 + + [Fraction(23, 5)] * 2 + + [Fraction(7, 2)] * 10)) + self.assertEqual(self.func([10], [7]), 10) # n=1 fast path + with self.assertRaises(TypeError): + self.func([1, 2, 3], [1, (), 3]) # non-numeric weight + with self.assertRaises(statistics.StatisticsError): + self.func([1, 2, 3], [1, 2]) # wrong number of weights + with self.assertRaises(statistics.StatisticsError): + self.func([10], [0]) # no non-zero weights + with self.assertRaises(statistics.StatisticsError): + self.func([10, 20], [0, 0]) # no non-zero weights + class TestMedian(NumericTestCase, AverageMixin): # Common tests for median and all median.* functions. diff --git a/Misc/NEWS.d/next/Library/2020-12-23-15-16-12.bpo-38308.lB4Sv0.rst b/Misc/NEWS.d/next/Library/2020-12-23-15-16-12.bpo-38308.lB4Sv0.rst new file mode 100644 index 00000000000..cf3807d9dc4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-12-23-15-16-12.bpo-38308.lB4Sv0.rst @@ -0,0 +1 @@ +Add optional *weights* to *statistics.harmonic_mean()*.