From 457e4d1a516c2b83edeff2f255f4cd6e7b114feb Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 20:06:43 -0500 Subject: [PATCH] GH-102670: Use sumprod() to simplify, speed up, and improve accuracy of statistics functions (GH-102649) --- Lib/statistics.py | 26 ++++++++++--------- Lib/test/test_statistics.py | 12 ++++++++- ...-03-13-18-27-00.gh-issue-102670.GyoThv.rst | 2 ++ 3 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst diff --git a/Lib/statistics.py b/Lib/statistics.py index 7d5d750193a..6bd214bbfe2 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1036,7 +1036,7 @@ def covariance(x, y, /): raise StatisticsError('covariance requires at least two data points') xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) + sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y)) return sxy / (n - 1) @@ -1074,11 +1074,14 @@ def correlation(x, y, /, *, method='linear'): start = (n - 1) / -2 # Center rankings around zero x = _rank(x, start=start) y = _rank(y, start=start) - xbar = fsum(x) / n - ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((d := xi - xbar) * d for xi in x) - syy = fsum((d := yi - ybar) * d for yi in y) + else: + xbar = fsum(x) / n + ybar = fsum(y) / n + x = [xi - xbar for xi in x] + y = [yi - ybar for yi in y] + sxy = sumprod(x, y) + sxx = sumprod(x, x) + syy = sumprod(y, y) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: @@ -1131,14 +1134,13 @@ def linear_regression(x, y, /, *, proportional=False): raise StatisticsError('linear regression requires that both inputs have same number of data points') if n < 2: raise StatisticsError('linear regression requires at least two data points') - if proportional: - sxy = fsum(xi * yi for xi, yi in zip(x, y)) - sxx = fsum(xi * xi for xi in x) - else: + if not proportional: xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((d := xi - xbar) * d for xi in x) + x = [xi - xbar for xi in x] # List because used three times below + y = (yi - ybar for yi in y) # Generator because only used once below + sxy = sumprod(x, y) + 0.0 # Add zero to coerce result to a float + sxx = sumprod(x, x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) except ZeroDivisionError: diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 31a3cb6b53a..f0fa6454b1f 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1,4 +1,4 @@ -"""Test suite for statistics module, including helper NumericTestCase and +x = """Test suite for statistics module, including helper NumericTestCase and approx_equal function. """ @@ -2610,6 +2610,16 @@ class TestLinearRegression(unittest.TestCase): self.assertAlmostEqual(slope, 20 + 1/150) self.assertEqual(intercept, 0.0) + def test_float_output(self): + x = [Fraction(2, 3), Fraction(3, 4)] + y = [Fraction(4, 5), Fraction(5, 6)] + slope, intercept = statistics.linear_regression(x, y) + self.assertTrue(isinstance(slope, float)) + self.assertTrue(isinstance(intercept, float)) + slope, intercept = statistics.linear_regression(x, y, proportional=True) + self.assertTrue(isinstance(slope, float)) + self.assertTrue(isinstance(intercept, float)) + class TestNormalDist: # General note on precision: The pdf(), cdf(), and overlap() methods diff --git a/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst b/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst new file mode 100644 index 00000000000..3de09f86754 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst @@ -0,0 +1,2 @@ +Optimized fmean(), correlation(), covariance(), and linear_regression() +using the new math.sumprod() function.