bpo-36169 : Add overlap() method to statistics.NormalDist (GH-12149)
This commit is contained in:
parent
e942e7b5c9
commit
318d537daa
|
@ -549,6 +549,28 @@ of applications in statistics, including simulations and hypothesis testing.
|
||||||
compute the probability that a random variable *X* will be less than or
|
compute the probability that a random variable *X* will be less than or
|
||||||
equal to *x*. Mathematically, it is written ``P(X <= x)``.
|
equal to *x*. Mathematically, it is written ``P(X <= x)``.
|
||||||
|
|
||||||
|
.. method:: NormalDist.overlap(other)
|
||||||
|
|
||||||
|
Compute the `overlapping coefficient (OVL)
|
||||||
|
<http://www.iceaaonline.com/ready/wp-content/uploads/2014/06/MM-9-Presentation-Meet-the-Overlapping-Coefficient-A-Measure-for-Elevator-Speeches.pdf>`_
|
||||||
|
between two normal distributions.
|
||||||
|
|
||||||
|
Measures the agreement between two normal probability distributions.
|
||||||
|
Returns a value between 0.0 and 1.0 giving the overlapping area for
|
||||||
|
two probability density functions.
|
||||||
|
|
||||||
|
In this `example from John M. Linacre
|
||||||
|
<https://www.rasch.org/rmt/rmt101r.htm>`_ about 80% of each
|
||||||
|
distribution overlaps the other:
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
|
>>> N1 = NormalDist(2.4, 1.6)
|
||||||
|
>>> N2 = NormalDist(3.2, 2.0)
|
||||||
|
>>> ovl = N1.overlap(N2)
|
||||||
|
>>> f'{ovl * 100.0 :.1f}%'
|
||||||
|
'80.4%'
|
||||||
|
|
||||||
Instances of :class:`NormalDist` support addition, subtraction,
|
Instances of :class:`NormalDist` support addition, subtraction,
|
||||||
multiplication and division by a constant. These operations
|
multiplication and division by a constant. These operations
|
||||||
are used for translation and scaling. For example:
|
are used for translation and scaling. For example:
|
||||||
|
@ -595,6 +617,16 @@ determine the percentage of students with scores between 1100 and 1200:
|
||||||
>>> f'{fraction * 100 :.1f}% score between 1100 and 1200'
|
>>> f'{fraction * 100 :.1f}% score between 1100 and 1200'
|
||||||
'18.2% score between 1100 and 1200'
|
'18.2% score between 1100 and 1200'
|
||||||
|
|
||||||
|
What percentage of men and women will have the same height in `two normally
|
||||||
|
distributed populations with known means and standard deviations
|
||||||
|
<http://www.usablestats.com/lessons/normal>`_?
|
||||||
|
|
||||||
|
>>> men = NormalDist(70, 4)
|
||||||
|
>>> women = NormalDist(65, 3.5)
|
||||||
|
>>> ovl = men.overlap(women)
|
||||||
|
>>> round(ovl * 100.0, 1)
|
||||||
|
50.3
|
||||||
|
|
||||||
To estimate the distribution for a model than isn't easy to solve
|
To estimate the distribution for a model than isn't easy to solve
|
||||||
analytically, :class:`NormalDist` can generate input samples for a `Monte
|
analytically, :class:`NormalDist` can generate input samples for a `Monte
|
||||||
Carlo simulation <https://en.wikipedia.org/wiki/Monte_Carlo_method>`_ of the
|
Carlo simulation <https://en.wikipedia.org/wiki/Monte_Carlo_method>`_ of the
|
||||||
|
|
|
@ -91,7 +91,7 @@ from fractions import Fraction
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from bisect import bisect_left, bisect_right
|
from bisect import bisect_left, bisect_right
|
||||||
from math import hypot, sqrt, fabs, exp, erf, tau
|
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -740,6 +740,41 @@ class NormalDist:
|
||||||
raise StatisticsError('cdf() not defined when sigma is zero')
|
raise StatisticsError('cdf() not defined when sigma is zero')
|
||||||
return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))
|
return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))
|
||||||
|
|
||||||
|
def overlap(self, other):
|
||||||
|
'''Compute the overlapping coefficient (OVL) between two normal distributions.
|
||||||
|
|
||||||
|
Measures the agreement between two normal probability distributions.
|
||||||
|
Returns a value between 0.0 and 1.0 giving the overlapping area in
|
||||||
|
the two underlying probability density functions.
|
||||||
|
|
||||||
|
>>> N1 = NormalDist(2.4, 1.6)
|
||||||
|
>>> N2 = NormalDist(3.2, 2.0)
|
||||||
|
>>> N1.overlap(N2)
|
||||||
|
0.8035050657330205
|
||||||
|
|
||||||
|
'''
|
||||||
|
# See: "The overlapping coefficient as a measure of agreement between
|
||||||
|
# probability distributions and point estimation of the overlap of two
|
||||||
|
# normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
|
||||||
|
# http://dx.doi.org/10.1080/03610928908830127
|
||||||
|
if not isinstance(other, NormalDist):
|
||||||
|
raise TypeError('Expected another NormalDist instance')
|
||||||
|
X, Y = self, other
|
||||||
|
if (Y.sigma, Y.mu) < (X.sigma, X.mu): # sort to assure commutativity
|
||||||
|
X, Y = Y, X
|
||||||
|
X_var, Y_var = X.variance, Y.variance
|
||||||
|
if not X_var or not Y_var:
|
||||||
|
raise StatisticsError('overlap() not defined when sigma is zero')
|
||||||
|
dv = Y_var - X_var
|
||||||
|
dm = fabs(Y.mu - X.mu)
|
||||||
|
if not dv:
|
||||||
|
return 2.0 * NormalDist(dm, 2.0 * X.sigma).cdf(0)
|
||||||
|
a = X.mu * Y_var - Y.mu * X_var
|
||||||
|
b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
|
||||||
|
x1 = (a + b) / dv
|
||||||
|
x2 = (a - b) / dv
|
||||||
|
return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def mean(self):
|
def mean(self):
|
||||||
'Arithmetic mean of the normal distribution'
|
'Arithmetic mean of the normal distribution'
|
||||||
|
|
|
@ -2162,6 +2162,68 @@ class TestNormalDist(unittest.TestCase):
|
||||||
self.assertEqual(X.cdf(float('Inf')), 1.0)
|
self.assertEqual(X.cdf(float('Inf')), 1.0)
|
||||||
self.assertTrue(math.isnan(X.cdf(float('NaN'))))
|
self.assertTrue(math.isnan(X.cdf(float('NaN'))))
|
||||||
|
|
||||||
|
def test_overlap(self):
|
||||||
|
NormalDist = statistics.NormalDist
|
||||||
|
|
||||||
|
# Match examples from Imman and Bradley
|
||||||
|
for X1, X2, published_result in [
|
||||||
|
(NormalDist(0.0, 2.0), NormalDist(1.0, 2.0), 0.80258),
|
||||||
|
(NormalDist(0.0, 1.0), NormalDist(1.0, 2.0), 0.60993),
|
||||||
|
]:
|
||||||
|
self.assertAlmostEqual(X1.overlap(X2), published_result, places=4)
|
||||||
|
self.assertAlmostEqual(X2.overlap(X1), published_result, places=4)
|
||||||
|
|
||||||
|
# Check against integration of the PDF
|
||||||
|
def overlap_numeric(X, Y, *, steps=8_192, z=5):
|
||||||
|
'Numerical integration cross-check for overlap() '
|
||||||
|
fsum = math.fsum
|
||||||
|
center = (X.mu + Y.mu) / 2.0
|
||||||
|
width = z * max(X.sigma, Y.sigma)
|
||||||
|
start = center - width
|
||||||
|
dx = 2.0 * width / steps
|
||||||
|
x_arr = [start + i*dx for i in range(steps)]
|
||||||
|
xp = list(map(X.pdf, x_arr))
|
||||||
|
yp = list(map(Y.pdf, x_arr))
|
||||||
|
total = max(fsum(xp), fsum(yp))
|
||||||
|
return fsum(map(min, xp, yp)) / total
|
||||||
|
|
||||||
|
for X1, X2 in [
|
||||||
|
# Examples from Imman and Bradley
|
||||||
|
(NormalDist(0.0, 2.0), NormalDist(1.0, 2.0)),
|
||||||
|
(NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
|
||||||
|
# Example from https://www.rasch.org/rmt/rmt101r.htm
|
||||||
|
(NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
|
||||||
|
# Gender heights from http://www.usablestats.com/lessons/normal
|
||||||
|
(NormalDist(70, 4), NormalDist(65, 3.5)),
|
||||||
|
# Misc cases with equal standard deviations
|
||||||
|
(NormalDist(100, 15), NormalDist(110, 15)),
|
||||||
|
(NormalDist(-100, 15), NormalDist(110, 15)),
|
||||||
|
(NormalDist(-100, 15), NormalDist(-110, 15)),
|
||||||
|
# Misc cases with unequal standard deviations
|
||||||
|
(NormalDist(100, 12), NormalDist(110, 15)),
|
||||||
|
(NormalDist(100, 12), NormalDist(150, 15)),
|
||||||
|
(NormalDist(100, 12), NormalDist(150, 35)),
|
||||||
|
# Misc cases with small values
|
||||||
|
(NormalDist(1.000, 0.002), NormalDist(1.001, 0.003)),
|
||||||
|
(NormalDist(1.000, 0.002), NormalDist(1.006, 0.0003)),
|
||||||
|
(NormalDist(1.000, 0.002), NormalDist(1.001, 0.099)),
|
||||||
|
]:
|
||||||
|
self.assertAlmostEqual(X1.overlap(X2), overlap_numeric(X1, X2), places=5)
|
||||||
|
self.assertAlmostEqual(X2.overlap(X1), overlap_numeric(X1, X2), places=5)
|
||||||
|
|
||||||
|
# Error cases
|
||||||
|
X = NormalDist()
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
X.overlap() # too few arguments
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
X.overlap(X, X) # too may arguments
|
||||||
|
with self.assertRaises(TypeError):
|
||||||
|
X.overlap(None) # right operand not a NormalDist
|
||||||
|
with self.assertRaises(statistics.StatisticsError):
|
||||||
|
X.overlap(NormalDist(1, 0)) # right operand sigma is zero
|
||||||
|
with self.assertRaises(statistics.StatisticsError):
|
||||||
|
NormalDist(1, 0).overlap(X) # left operand sigma is zero
|
||||||
|
|
||||||
def test_properties(self):
|
def test_properties(self):
|
||||||
X = statistics.NormalDist(100, 15)
|
X = statistics.NormalDist(100, 15)
|
||||||
self.assertEqual(X.mean, 100)
|
self.assertEqual(X.mean, 100)
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Add overlap() method to statistics.NormalDist. Computes the overlapping
|
||||||
|
coefficient for two normal distributions.
|
Loading…
Reference in New Issue