Add harmonic mean and tests.

This commit is contained in:
Steven D'Aprano 2016-08-09 12:49:01 +10:00
parent 95e0df8389
commit a474afdddc
2 changed files with 211 additions and 14 deletions

View File

@ -28,6 +28,7 @@ Calculating averages
Function Description
================== =============================================
mean Arithmetic mean (average) of data.
harmonic_mean Harmonic mean of data.
median Median (middle value) of data.
median_low Low median of data.
median_high High median of data.
@ -95,16 +96,17 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
__all__ = [ 'StatisticsError',
'pstdev', 'pvariance', 'stdev', 'variance',
'median', 'median_low', 'median_high', 'median_grouped',
'mean', 'mode',
'mean', 'mode', 'harmonic_mean',
]
import collections
import decimal
import math
import numbers
from fractions import Fraction
from decimal import Decimal
from itertools import groupby
from itertools import groupby, chain
from bisect import bisect_left, bisect_right
@ -135,7 +137,8 @@ def _sum(data, start=0):
Some sources of round-off error will be avoided:
>>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
# Built-in sum returns zero.
>>> _sum([1e50, 1, -1e50] * 1000)
(<class 'float'>, Fraction(1000, 1), 3000)
Fractions and Decimals are also supported:
@ -291,6 +294,15 @@ def _find_rteq(a, l, x):
return i-1
raise ValueError
def _fail_neg(values, errmsg='negative value'):
    """Lazily pass the items of ``values`` through, rejecting negatives.

    Each item is compared against zero just before it is yielded, so the
    check happens as the caller consumes the generator rather than up front.

    Raises StatisticsError(errmsg) at the first item that is less than zero;
    items preceding it have already been yielded unchanged.
    """
    for value in values:
        if not value < 0:
            yield value
        else:
            raise StatisticsError(errmsg)
# === Measures of central tendency (averages) ===
def mean(data):
@ -319,6 +331,52 @@ def mean(data):
return _convert(total/n, T)
def harmonic_mean(data):
    """Return the harmonic mean of data.

    The harmonic mean, sometimes called the subcontrary mean, is the
    reciprocal of the arithmetic mean of the reciprocals of the data,
    and is often appropriate when averaging quantities which are rates
    or ratios, for example speeds. Example:

    Suppose an investor purchases an equal value of shares in each of
    three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
    What is the average P/E ratio for the investor's portfolio?

    >>> harmonic_mean([2.5, 3, 10])  # For an equal investment portfolio.
    3.6

    Using the arithmetic mean would give an average of about 5.167, which
    is too high.

    If any element is zero (and none is negative), the result is zero.

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.  A single data point
    that is neither a real number nor a Decimal raises ``TypeError``.
    """
    # For a justification for using harmonic mean for P/E ratios, see
    # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
    # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
    if iter(data) is data:
        # One-shot iterators are materialised: we need len() and may have
        # to walk the data a second time below.
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    if n == 1:
        # Fetch the only element by iteration rather than data[0], so that
        # sized but non-indexable containers (e.g. a set) are accepted here
        # just as they are in the multi-element path.
        x = next(iter(data))
        if not isinstance(x, (numbers.Real, Decimal)):
            raise TypeError('unsupported type')
        if x < 0:
            raise StatisticsError(errmsg)
        return x
    try:
        T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
    except ZeroDivisionError:
        found_zero = True
    else:
        found_zero = False
    if found_zero:
        # The reciprocal of a zero aborted the pass above before the
        # remaining values were examined.  Finish the validation so that a
        # negative value is rejected regardless of where the zero sits.
        for _ in _fail_neg(data, errmsg):
            pass
        return 0
    assert count == n
    return _convert(n/total, T)
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
"""Return the median (middle value) of numeric data.

View File

@ -21,6 +21,10 @@ import statistics
# === Helper functions and class ===
def sign(x):
    """Return a float of magnitude one carrying the sign of x.

    The sign is copied from x, so negative numbers and -0.0 give -1.0,
    while positive numbers and +0.0 give +1.0.
    """
    unit = 1.0
    return math.copysign(unit, x)
def _nan_equal(a, b):
"""Return True if a and b are both the same kind of NAN.
@ -264,6 +268,13 @@ class NumericTestCase(unittest.TestCase):
# === Test the helpers ===
# ========================
class TestSign(unittest.TestCase):
    """Exercise the sign() helper function."""

    def testZeroes(self):
        # +0.0 and -0.0 compare equal to each other, so sign() is the way
        # to tell them apart; each must report its own sign.
        positive_zero, negative_zero = 0.0, -0.0
        self.assertEqual(sign(positive_zero), +1)
        self.assertEqual(sign(negative_zero), -1)
# --- Tests for approx_equal ---
@ -659,7 +670,7 @@ class DocTests(unittest.TestCase):
@unittest.skipIf(sys.flags.optimize >= 2,
"Docstrings are omitted with -OO and above")
def test_doc_tests(self):
failed, tried = doctest.testmod(statistics)
failed, tried = doctest.testmod(statistics, optionflags=doctest.ELLIPSIS)
self.assertGreater(tried, 0)
self.assertEqual(failed, 0)
@ -971,6 +982,34 @@ class ConvertTest(unittest.TestCase):
self.assertTrue(_nan_equal(x, nan))
class FailNegTest(unittest.TestCase):
    """Tests for the private generator statistics._fail_neg."""

    def test_pass_through(self):
        # Non-negative values of every supported numeric type must come
        # out of the generator exactly as they went in.
        values = [1, 2.0, Fraction(3), Decimal(4)]
        passed = [item for item in statistics._fail_neg(values)]
        self.assertEqual(values, passed)

    def test_negatives_raise(self):
        # A negative value of any supported numeric type must raise as soon
        # as the generator is advanced.
        for positive in (1, 2.0, Fraction(3), Decimal(4)):
            checker = statistics._fail_neg([-positive])
            self.assertRaises(statistics.StatisticsError, next, checker)

    def test_error_msg(self):
        # The caller-supplied message must be the one carried by the
        # exception.  A random suffix rules out matching a built-in default.
        msg = "badness #%d" % random.randint(10000, 99999)
        checker = statistics._fail_neg([-1], msg)
        raised = None
        try:
            next(checker)
        except statistics.StatisticsError as exc:
            raised = exc
        if raised is None:
            self.fail("expected exception, but it didn't happen")
        self.assertEqual(raised.args[0], msg)
# === Tests for public functions ===
class UnivariateCommonMixin:
@ -1082,13 +1121,13 @@ class UnivariateTypeMixin:
Not all tests to do with types need go in this class. Only those that
rely on the function returning the same type as its input data.
"""
def test_types_conserved(self):
# Test that functions keeps the same type as their data points.
# (Excludes mixed data types.) This only tests the type of the return
# result, not the value.
def prepare_types_for_conservation_test(self):
"""Return the types which are expected to be conserved."""
class MyFloat(float):
def __truediv__(self, other):
return type(self)(super().__truediv__(other))
def __rtruediv__(self, other):
return type(self)(super().__rtruediv__(other))
def __sub__(self, other):
return type(self)(super().__sub__(other))
def __rsub__(self, other):
@ -1098,9 +1137,14 @@ class UnivariateTypeMixin:
def __add__(self, other):
return type(self)(super().__add__(other))
__radd__ = __add__
return (float, Decimal, Fraction, MyFloat)
def test_types_conserved(self):
# Test that functions keep the same type as their data points.
# (Excludes mixed data types.) This only tests the type of the return
# result, not the value.
data = self.prepare_data()
for kind in (float, Decimal, Fraction, MyFloat):
for kind in self.prepare_types_for_conservation_test():
d = [kind(x) for x in data]
result = self.func(d)
self.assertIs(type(result), kind)
@ -1275,10 +1319,14 @@ class AverageMixin(UnivariateCommonMixin):
for x in (23, 42.5, 1.3e15, Fraction(15, 19), Decimal('0.28')):
self.assertEqual(self.func([x]), x)
def prepare_values_for_repeated_single_test(self):
return (3.5, 17, 2.5e15, Fraction(61, 67), Decimal('4.9712'))
def test_repeated_single_value(self):
# The average of a single repeated value is the value itself.
for x in (3.5, 17, 2.5e15, Fraction(61, 67), Decimal('4.9712')):
for x in self.prepare_values_for_repeated_single_test():
for count in (2, 5, 10, 20):
with self.subTest(x=x, count=count):
data = [x]*count
self.assertEqual(self.func(data), x)
@ -1304,7 +1352,7 @@ class TestMean(NumericTestCase, AverageMixin, UnivariateTypeMixin):
self.assertEqual(self.func(data), 22.015625)
def test_decimals(self):
# Test mean with ints.
# Test mean with Decimals.
D = Decimal
data = [D("1.634"), D("2.517"), D("3.912"), D("4.072"), D("5.813")]
random.shuffle(data)
@ -1379,6 +1427,97 @@ class TestMean(NumericTestCase, AverageMixin, UnivariateTypeMixin):
self.assertEqual(statistics.mean([tiny]*n), tiny)
class TestHarmonicMean(NumericTestCase, AverageMixin, UnivariateTypeMixin):
    """Tests for statistics.harmonic_mean."""

    def setUp(self):
        self.func = statistics.harmonic_mean

    def prepare_data(self):
        # Override mixin method: drop the zero from the inherited data.
        # (A zero data point makes the harmonic mean zero; see test_zero.)
        data = super().prepare_data()
        data.remove(0)
        return data

    def prepare_values_for_repeated_single_test(self):
        # Override mixin method.
        # NOTE(review): presumably the Decimal differs from the mixin's so
        # that the reciprocal round trip stays exact -- confirm.
        return (3.5, 17, 2.5e15, Fraction(61, 67), Decimal('4.125'))

    def test_zero(self):
        # A single zero among the data makes the harmonic mean zero.
        self.assertEqual(self.func([1, 0, 2]), 0)

    def test_negative_error(self):
        # Any negative data point must be rejected with StatisticsError.
        expected_error = statistics.StatisticsError
        for values in ([-1], [1, -2, 3]):
            with self.subTest(values=values):
                self.assertRaises(expected_error, self.func, values)

    def test_ints(self):
        # Harmonic mean of ints; the order of the data must not matter.
        sample = [2, 4, 4, 8, 16, 16]
        random.shuffle(sample)
        self.assertEqual(self.func(sample), 6*4/5)

    def test_floats_exact(self):
        # Floats that are powers of two, so the expected result is exact.
        sample = [1/8, 1/4, 1/4, 1/2, 1/2]
        random.shuffle(sample)
        self.assertEqual(self.func(sample), 1/4)
        self.assertEqual(self.func([0.25, 0.5, 1.0, 1.0]), 0.5)

    def test_singleton_lists(self):
        # harmonic_mean([x]) returns (approximately) x for x in 1..100.
        # A handful of values are only required to match within tolerance.
        needs_tolerance = (49, 93, 98, 99)
        for x in range(1, 101):
            if x not in needs_tolerance:
                self.assertEqual(self.func([x]), x)
            else:
                self.assertApproxEqual(self.func([x]), x, tol=2e-14)

    def test_decimals_exact(self):
        # Decimals chosen so that the expected result can be stated exactly.
        D = Decimal
        self.assertEqual(self.func([D(15), D(30), D(60), D(60)]), D(30))
        sample = [D("0.05"), D("0.10"), D("0.20"), D("0.20")]
        random.shuffle(sample)
        self.assertEqual(self.func(sample), D("0.10"))
        sample = [D("1.68"), D("0.32"), D("5.94"), D("2.75")]
        random.shuffle(sample)
        self.assertEqual(self.func(sample), D(66528)/70723)

    def test_fractions(self):
        # Harmonic mean of Fractions is computed exactly.
        F = Fraction
        sample = [
            F(1, 2), F(2, 3), F(3, 4), F(4, 5), F(5, 6), F(6, 7), F(7, 8),
        ]
        random.shuffle(sample)
        self.assertEqual(self.func(sample), F(7*420, 4029))

    def test_inf(self):
        # An infinite data point contributes a reciprocal of zero.
        sample = [2.0, float('inf'), 1.0]
        self.assertEqual(self.func(sample), 2.0)

    def test_nan(self):
        # A NAN among the data propagates to the result.
        sample = [2.0, float('nan'), 1.0]
        outcome = self.func(sample)
        self.assertTrue(math.isnan(outcome))

    def test_multiply_data_points(self):
        # Scaling every data point by a constant scales the mean equally.
        c = 111
        data = [3.4, 4.5, 4.9, 6.7, 6.8, 7.2, 8.0, 8.1, 9.4]
        expected = self.func(data)*c
        scaled = [x*c for x in data]
        self.assertEqual(self.func(scaled), expected)

    def test_doubled_data(self):
        # Harmonic mean of [a,b...z] should be same as for [a,a,b,b...z,z].
        data = [random.uniform(1, 5) for _ in range(1000)]
        expected = self.func(data)
        doubled = data*2
        self.assertApproxEqual(self.func(doubled), expected)
class TestMedian(NumericTestCase, AverageMixin):
# Common tests for median and all median.* functions.
def setUp(self):