bpo-36169 : Add overlap() method to statistics.NormalDist (GH-12149)

2019-03-06 22:59:40 -08:00 · 2019-03-06 22:59:40 -08:00 · 318d537daa
parent e942e7b5c9
commit 318d537daa
4 changed files with 132 additions and 1 deletions
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@ -549,6 +549,28 @@ of applications in statistics, including simulations and hypothesis testing.
       compute the probability that a random variable *X* will be less than or
       equal to *x*.  Mathematically, it is written ``P(X <= x)``.
    .. method:: NormalDist.overlap(other)
       Compute the `overlapping coefficient (OVL)
       <http://www.iceaaonline.com/ready/wp-content/uploads/2014/06/MM-9-Presentation-Meet-the-Overlapping-Coefficient-A-Measure-for-Elevator-Speeches.pdf>`_
       between two normal distributions.
       Measures the agreement between two normal probability distributions.
       Returns a value between 0.0 and 1.0 giving the overlapping area for
       two probability density functions.
       In this `example from John M. Linacre
       <https://www.rasch.org/rmt/rmt101r.htm>`_ about 80% of each
       distribution overlaps the other:
       .. doctest::
           >>> N1 = NormalDist(2.4, 1.6)
           >>> N2 = NormalDist(3.2, 2.0)
           >>> ovl = N1.overlap(N2)
           >>> f'{ovl * 100.0 :.1f}%'
           '80.4%'
    Instances of :class:`NormalDist` support addition, subtraction,
    multiplication and division by a constant.  These operations
    are used for translation and scaling.  For example:
@ -595,6 +617,16 @@ determine the percentage of students with scores between 1100 and 1200:
    >>> f'{fraction * 100 :.1f}% score between 1100 and 1200'
    '18.2% score between 1100 and 1200'
 What percentage of men and women will have the same height in `two normally
 distributed populations with known means and standard deviations
 <http://www.usablestats.com/lessons/normal>`_?
    >>> men = NormalDist(70, 4)
    >>> women = NormalDist(65, 3.5)
    >>> ovl = men.overlap(women)
    >>> round(ovl * 100.0, 1)
    50.3
 To estimate the distribution for a model than isn't easy to solve
 analytically, :class:`NormalDist` can generate input samples for a `Monte
 Carlo simulation <https://en.wikipedia.org/wiki/Monte_Carlo_method>`_ of the
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@ -91,7 +91,7 @@ from fractions import Fraction
 from decimal import Decimal
 from itertools import groupby
 from bisect import bisect_left, bisect_right
-from math import hypot, sqrt, fabs, exp, erf, tau
+from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
@ -740,6 +740,41 @@ class NormalDist:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self.mu) / (self.sigma * sqrt(2.0))))
    def overlap(self, other):
        '''Compute the overlapping coefficient (OVL) between two normal distributions.
        Measures the agreement between two normal probability distributions.
        Returns a value between 0.0 and 1.0 giving the overlapping area in
        the two underlying probability density functions.
            >>> N1 = NormalDist(2.4, 1.6)
            >>> N2 = NormalDist(3.2, 2.0)
            >>> N1.overlap(N2)
            0.8035050657330205
        '''
        # See: "The overlapping coefficient as a measure of agreement between
        # probability distributions and point estimation of the overlap of two
        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
        # http://dx.doi.org/10.1080/03610928908830127
        if not isinstance(other, NormalDist):
            raise TypeError('Expected another NormalDist instance')
        X, Y = self, other
        if (Y.sigma, Y.mu) < (X.sigma, X.mu):   # sort to assure commutativity
            X, Y = Y, X
        X_var, Y_var = X.variance, Y.variance
        if not X_var or not Y_var:
            raise StatisticsError('overlap() not defined when sigma is zero')
        dv = Y_var - X_var
        dm = fabs(Y.mu - X.mu)
        if not dv:
            return 2.0 * NormalDist(dm, 2.0 * X.sigma).cdf(0)
        a = X.mu * Y_var - Y.mu * X_var
        b = X.sigma * Y.sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
        x1 = (a + b) / dv
        x2 = (a - b) / dv
        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
    @property
    def mean(self):
        'Arithmetic mean of the normal distribution'
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@ -2162,6 +2162,68 @@ class TestNormalDist(unittest.TestCase):
        self.assertEqual(X.cdf(float('Inf')), 1.0)
        self.assertTrue(math.isnan(X.cdf(float('NaN'))))
    def test_overlap(self):
        NormalDist = statistics.NormalDist
        # Match examples from Imman and Bradley
        for X1, X2, published_result in [
                (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0), 0.80258),
                (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0), 0.60993),
            ]:
            self.assertAlmostEqual(X1.overlap(X2), published_result, places=4)
            self.assertAlmostEqual(X2.overlap(X1), published_result, places=4)
        # Check against integration of the PDF
        def overlap_numeric(X, Y, *, steps=8_192, z=5):
            'Numerical integration cross-check for overlap() '
            fsum = math.fsum
            center = (X.mu + Y.mu) / 2.0
            width = z * max(X.sigma, Y.sigma)
            start = center - width
            dx = 2.0 * width / steps
            x_arr = [start + i*dx for i in range(steps)]
            xp = list(map(X.pdf, x_arr))
            yp = list(map(Y.pdf, x_arr))
            total = max(fsum(xp), fsum(yp))
            return fsum(map(min, xp, yp)) / total
        for X1, X2 in [
                # Examples from Imman and Bradley
                (NormalDist(0.0, 2.0), NormalDist(1.0, 2.0)),
                (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
                # Example from https://www.rasch.org/rmt/rmt101r.htm
                (NormalDist(0.0, 1.0), NormalDist(1.0, 2.0)),
                # Gender heights from http://www.usablestats.com/lessons/normal
                (NormalDist(70, 4), NormalDist(65, 3.5)),
                # Misc cases with equal standard deviations
                (NormalDist(100, 15), NormalDist(110, 15)),
                (NormalDist(-100, 15), NormalDist(110, 15)),
                (NormalDist(-100, 15), NormalDist(-110, 15)),
                # Misc cases with unequal standard deviations
                (NormalDist(100, 12), NormalDist(110, 15)),
                (NormalDist(100, 12), NormalDist(150, 15)),
                (NormalDist(100, 12), NormalDist(150, 35)),
                # Misc cases with small values
                (NormalDist(1.000, 0.002), NormalDist(1.001, 0.003)),
                (NormalDist(1.000, 0.002), NormalDist(1.006, 0.0003)),
                (NormalDist(1.000, 0.002), NormalDist(1.001, 0.099)),
            ]:
            self.assertAlmostEqual(X1.overlap(X2), overlap_numeric(X1, X2), places=5)
            self.assertAlmostEqual(X2.overlap(X1), overlap_numeric(X1, X2), places=5)
        # Error cases
        X = NormalDist()
        with self.assertRaises(TypeError):
            X.overlap()                             # too few arguments
        with self.assertRaises(TypeError):
            X.overlap(X, X)                         # too may arguments
        with self.assertRaises(TypeError):
            X.overlap(None)                         # right operand not a NormalDist
        with self.assertRaises(statistics.StatisticsError):
            X.overlap(NormalDist(1, 0))             # right operand sigma is zero
        with self.assertRaises(statistics.StatisticsError):
            NormalDist(1, 0).overlap(X)             # left operand sigma is zero
    def test_properties(self):
        X = statistics.NormalDist(100, 15)
        self.assertEqual(X.mean, 100)
--- a/Misc/NEWS.d/next/Library/2019-03-03-11-37-09.bpo-36169.8nWJy7.rst
+++ b/Misc/NEWS.d/next/Library/2019-03-03-11-37-09.bpo-36169.8nWJy7.rst
@ -0,0 +1,2 @@
 Add overlap() method to statistics.NormalDist.  Computes the overlapping
 coefficient for two normal distributions.
		`@ -0,0 +1,2 @@`
							`Add overlap() method to statistics.NormalDist. Computes the overlapping`
							`coefficient for two normal distributions.`