From 09aa6f914dc313875ff18474770a0a7c13ea8dea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tymoteusz=20Wo=C5=82od=C5=BAko?=
Date: Sun, 25 Apr 2021 13:45:09 +0200
Subject: [PATCH] bpo-38490: statistics: Add covariance, Pearson's
 correlation, and simple linear regression (#16813)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Tymoteusz Wołodźko

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
+.. function:: covariance(x, y, /)
+
+   Return the sample covariance of two inputs *x* and *y*. Covariance
+   is a measure of the joint variability of two inputs.
+
+   Both inputs must be of the same length (no less than two), otherwise
+   :exc:`StatisticsError` is raised.
+
+   Examples:
+
+   .. doctest::
+
+      >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+      >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
+      >>> covariance(x, y)
+      0.75
+      >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+      >>> covariance(x, z)
+      -7.5
+      >>> covariance(z, x)
+      -7.5
+
+   .. versionadded:: 3.10
+
+.. function:: correlation(x, y, /)
+
+   Return the `Pearson's correlation coefficient
+   <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
+   for two inputs. Pearson's correlation coefficient *r* takes values
+   between -1 and +1. It measures the strength and direction of the linear
+   relationship, where +1 means a very strong positive linear relationship,
+   -1 a very strong negative linear relationship, and 0 no linear
+   relationship.
+
+   Both inputs must be of the same length (no less than two), and neither
+   input can be constant, otherwise :exc:`StatisticsError` is raised.
+
+   Examples:
+
+   .. doctest::
+
+      >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+      >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+      >>> correlation(x, x)
+      1.0
+      >>> correlation(x, y)
+      -1.0
+
+   .. versionadded:: 3.10
+
+.. function:: linear_regression(regressor, dependent_variable, /)
+
+   Return the intercept and slope of a `simple linear regression
+   <https://en.wikipedia.org/wiki/Simple_linear_regression>`_
+   estimated using ordinary least squares. Simple linear regression
+   describes the relationship between the *regressor* and the
+   *dependent variable* in terms of a linear function:
+
+   *dependent_variable = intercept + slope \* regressor + noise*
+
+   where ``intercept`` and ``slope`` are the regression parameters that are
+   estimated, and the noise term is an unobserved random variable accounting
+   for the variability in the data not explained by the linear regression
+   (it equals the difference between the predicted and the actual values
+   of the dependent variable).
+
+   Both inputs must be of the same length (no less than two), and the
+   regressor must not be constant, otherwise :exc:`StatisticsError` is
+   raised.
+
+   For example, if we took the data on the release dates of the Monty
+   Python films and used it to fit the cumulative number of Monty Python
+   films produced, we could predict how many films they would have made
+   by 2019, assuming that they had kept up the pace.
+
+   .. doctest::
+
+      >>> year = [1971, 1975, 1979, 1982, 1983]
+      >>> films_total = [1, 2, 3, 4, 5]
+      >>> intercept, slope = linear_regression(year, films_total)
+      >>> round(intercept + slope * 2019)
+      16
+
+   We could also use it to "predict" how many Monty Python films existed
+   when Brian Cohen was born.
+
+   .. doctest::
+
+      >>> round(intercept + slope * 1)
+      -610
+
+   .. versionadded:: 3.10
+
 Exceptions
 ----------

diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst
index ab0b46f9f2d..1d9c03c439f 100644
--- a/Doc/whatsnew/3.10.rst
+++ b/Doc/whatsnew/3.10.rst
@@ -1051,6 +1051,14 @@
 The :mod:`shelve` module now uses :data:`pickle.DEFAULT_PROTOCOL` by default
 instead of :mod:`pickle` protocol ``3`` when creating shelves.
 (Contributed by Zackery Spytz in :issue:`34204`.)
 
+statistics
+----------
+
+Added :func:`~statistics.covariance`, Pearson's
+:func:`~statistics.correlation`, and simple
+:func:`~statistics.linear_regression` functions.
+(Contributed by Tymoteusz Wołodźko in :issue:`38490`.)
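A quick side-by-side sketch of the three new functions, with inputs and
expected values taken from the doctests added elsewhere in this patch (it
assumes the patched ``statistics`` module is importable)::

    >>> from statistics import covariance, correlation, linear_regression
    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> correlation(x, y)  # doctest: +ELLIPSIS
    0.31622776601...
    >>> linear_regression(x, y)
    LinearRegression(intercept=1.5, slope=0.1)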
+
 site
 ----

diff --git a/Lib/statistics.py b/Lib/statistics.py
index 2414869a7e6..673a162b3a7 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -73,6 +73,30 @@ second argument to the four "spread" functions to avoid recalculating it:
 2.5
 
+Statistics for relations between two inputs
+-------------------------------------------
+
+==================  ====================================================
+Function            Description
+==================  ====================================================
+covariance          Sample covariance for two variables.
+correlation         Pearson's correlation coefficient for two variables.
+linear_regression   Intercept and slope for simple linear regression.
+==================  ====================================================
+
+Calculate covariance, Pearson's correlation, and simple linear regression
+for two inputs:
+
+>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
+>>> covariance(x, y)
+0.75
+>>> correlation(x, y)  #doctest: +ELLIPSIS
+0.31622776601...
+>>> linear_regression(x, y)
+LinearRegression(intercept=1.5, slope=0.1)
+
 
 Exceptions
 ----------
@@ -98,6 +122,9 @@ __all__ = [
     'quantiles',
     'stdev',
     'variance',
+    'correlation',
+    'covariance',
+    'linear_regression',
 ]
 
 import math
@@ -110,7 +137,7 @@ from itertools import groupby, repeat
 from bisect import bisect_left, bisect_right
 from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
 from operator import itemgetter
-from collections import Counter
+from collections import Counter, namedtuple
 
 
 # === Exceptions ===
@@ -826,6 +853,113 @@ def pstdev(data, mu=None):
     return math.sqrt(var)
 
 
+# === Statistics for relations between two inputs ===
+
+# See https://en.wikipedia.org/wiki/Covariance
+#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+#     https://en.wikipedia.org/wiki/Simple_linear_regression
+
+
+def covariance(x, y, /):
+    """Covariance
+
+    Return the sample covariance of two inputs *x* and *y*. Covariance
+    is a measure of the joint variability of two inputs.
+
+    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
+    >>> covariance(x, y)
+    0.75
+    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+    >>> covariance(x, z)
+    -7.5
+    >>> covariance(z, x)
+    -7.5
+
+    """
+    n = len(x)
+    if len(y) != n:
+        raise StatisticsError('covariance requires that both inputs have the same number of data points')
+    if n < 2:
+        raise StatisticsError('covariance requires at least two data points')
+    xbar = mean(x)
+    ybar = mean(y)
+    total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+    return total / (n - 1)
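A side note on the ``n - 1`` denominator used above: because ``covariance()``
and ``variance()`` apply the same Bessel correction, the covariance of an
input with itself is its sample variance. A minimal sketch, assuming the
patched module::

    from statistics import covariance, variance

    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    # Both divide the summed deviation products by n - 1; for this data
    # the self-covariance and the sample variance are both exactly 7.5.
    assert covariance(x, x) == variance(x) == 7.5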
+
+
+def correlation(x, y, /):
+    """Pearson's correlation coefficient
+
+    Return the Pearson's correlation coefficient for two inputs. Pearson's
+    correlation coefficient *r* takes values between -1 and +1. It measures
+    the strength and direction of the linear relationship, where +1 means a
+    very strong positive linear relationship, -1 a very strong negative
+    linear relationship, and 0 no linear relationship.
+
+    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+    >>> correlation(x, x)
+    1.0
+    >>> correlation(x, y)
+    -1.0
+
+    """
+    n = len(x)
+    if len(y) != n:
+        raise StatisticsError('correlation requires that both inputs have the same number of data points')
+    if n < 2:
+        raise StatisticsError('correlation requires at least two data points')
+    cov = covariance(x, y)
+    stdx = stdev(x)
+    stdy = stdev(y)
+    try:
+        return cov / (stdx * stdy)
+    except ZeroDivisionError:
+        raise StatisticsError('at least one of the inputs is constant')
+
+
+LinearRegression = namedtuple('LinearRegression', ['intercept', 'slope'])
+
+
+def linear_regression(regressor, dependent_variable, /):
+    """Intercept and slope for simple linear regression
+
+    Return the intercept and slope of a simple linear regression
+    estimated using ordinary least squares. Simple linear regression
+    describes the relationship between *regressor* and
+    *dependent_variable* in terms of a linear function::
+
+        dependent_variable = intercept + slope * regressor + noise
+
+    where ``intercept`` and ``slope`` are the regression parameters that
+    are estimated, and the noise term is an unobserved random variable
+    accounting for the variability in the data not explained by the
+    linear regression (it equals the difference between the predicted
+    and the actual values of the dependent variable).
+
+    The parameters are returned as a named tuple.
+
+    >>> regressor = [1, 2, 3, 4, 5]
+    >>> noise = NormalDist().samples(5, seed=42)
+    >>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)]
+    >>> linear_regression(regressor, dependent_variable)  #doctest: +ELLIPSIS
+    LinearRegression(intercept=1.75684970486..., slope=3.09078914170...)
+
+    """
+    n = len(regressor)
+    if len(dependent_variable) != n:
+        raise StatisticsError('linear regression requires that both inputs have the same number of data points')
+    if n < 2:
+        raise StatisticsError('linear regression requires at least two data points')
+    try:
+        slope = covariance(regressor, dependent_variable) / variance(regressor)
+    except ZeroDivisionError:
+        raise StatisticsError('regressor is constant')
+    intercept = mean(dependent_variable) - slope * mean(regressor)
+    return LinearRegression(intercept=intercept, slope=slope)
+
+
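Because ``linear_regression()`` returns a named tuple, the estimates can be
read by field name as well as by unpacking. A minimal sketch, assuming the
patched module, which also checks the ``slope = cov / var`` identity used by
the implementation above::

    from statistics import covariance, variance, linear_regression

    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    y = [1, 2, 3, 1, 2, 3, 1, 2, 3]

    result = linear_regression(x, y)
    # The slope is estimated as cov(x, y) / var(x); the intercept then
    # passes the fitted line through the point of sample means.
    assert result.slope == covariance(x, y) / variance(x)   # 0.1
    assert result.intercept == 1.5                          # exact for this data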
 ## Normal Distribution #####################################################

diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py
index 4b8686b6818..70d269dea73 100644
--- a/Lib/test/test_statistics.py
+++ b/Lib/test/test_statistics.py
@@ -2407,6 +2407,84 @@ class TestQuantiles(unittest.TestCase):
             quantiles([10, None, 30], n=4)          # data is non-numeric
 
 
+class TestBivariateStatistics(unittest.TestCase):
+
+    def test_unequal_size_error(self):
+        for x, y in [
+            ([1, 2, 3], [1, 2]),
+            ([1, 2], [1, 2, 3]),
+        ]:
+            with self.assertRaises(statistics.StatisticsError):
+                statistics.covariance(x, y)
+            with self.assertRaises(statistics.StatisticsError):
+                statistics.correlation(x, y)
+            with self.assertRaises(statistics.StatisticsError):
+                statistics.linear_regression(x, y)
+
+    def test_small_sample_error(self):
+        for x, y in [
+            ([], []),
+            ([], [1, 2]),
+            ([1, 2], []),
+            ([1], [1]),
+            ([1], [1, 2]),
+            ([1, 2], [1]),
+        ]:
+            with self.assertRaises(statistics.StatisticsError):
+                statistics.covariance(x, y)
+            with self.assertRaises(statistics.StatisticsError):
+                statistics.correlation(x, y)
+            with self.assertRaises(statistics.StatisticsError):
+                statistics.linear_regression(x, y)
+
+
+class TestCorrelationAndCovariance(unittest.TestCase):
+
+    def test_results(self):
+        for x, y, result in [
+            ([1, 2, 3], [1, 2, 3], 1),
+            ([1, 2, 3], [-1, -2, -3], -1),
+            ([1, 2, 3], [3, 2, 1], -1),
+            ([1, 2, 3], [1, 2, 1], 0),
+            ([1, 2, 3], [1, 3, 2], 0.5),
+        ]:
+            self.assertAlmostEqual(statistics.correlation(x, y), result)
+            self.assertAlmostEqual(statistics.covariance(x, y), result)
+
+    def test_different_scales(self):
+        x = [1, 2, 3]
+        y = [10, 30, 20]
+        self.assertAlmostEqual(statistics.correlation(x, y), 0.5)
+        self.assertAlmostEqual(statistics.covariance(x, y), 5)
+
+        y = [.1, .2, .3]
+        self.assertAlmostEqual(statistics.correlation(x, y), 1)
+        self.assertAlmostEqual(statistics.covariance(x, y), 0.1)
+
+
+class TestLinearRegression(unittest.TestCase):
+
+    def test_constant_input_error(self):
+        x = [1, 1, 1]
+        y = [1, 2, 3]
+        with self.assertRaises(statistics.StatisticsError):
+            statistics.linear_regression(x, y)
+
+    def test_results(self):
+        for x, y, true_intercept, true_slope in [
+            ([1, 2, 3], [0, 0, 0], 0, 0),
+            ([1, 2, 3], [1, 2, 3], 0, 1),
+            ([1, 2, 3], [100, 100, 100], 100, 0),
+            ([1, 2, 3], [12, 14, 16], 10, 2),
+            ([1, 2, 3], [-1, -2, -3], 0, -1),
+            ([1, 2, 3], [21, 22, 23], 20, 1),
+            ([1, 2, 3], [5.1, 5.2, 5.3], 5, 0.1),
+        ]:
+            intercept, slope = statistics.linear_regression(x, y)
+            self.assertAlmostEqual(intercept, true_intercept)
+            self.assertAlmostEqual(slope, true_slope)
+
+
 class TestNormalDist:
 
     # General note on precision: The pdf(), cdf(), and overlap() methods
diff --git a/Misc/ACKS b/Misc/ACKS
index 760d6c79835..7d9af855a74 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1927,6 +1927,7 @@ David Wolever
 Klaus-Juergen Wolf
 Dan Wolfe
 Richard Wolff
+Tymoteusz Wołodźko
 Adam Woodbeck
 William Woodruff
 Steven Work
diff --git a/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst
new file mode 100644
index 00000000000..82b9e33be0e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-10-16-08-08-14.bpo-38490.QbDXEF.rst
@@ -0,0 +1 @@
+Covariance, Pearson's correlation, and simple linear regression functionality was added to the statistics module. Patch by Tymoteusz Wołodźko.
\ No newline at end of file
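To close, a runnable recap of the Monty Python example from the documentation
above, written as a plain script rather than a doctest (a sketch; it assumes
the patched statistics module)::

    from statistics import linear_regression

    year = [1971, 1975, 1979, 1982, 1983]
    films_total = [1, 2, 3, 4, 5]

    intercept, slope = linear_regression(year, films_total)
    # Extrapolate the fitted line forward to 2019 and back to year 1.
    print(round(intercept + slope * 2019))   # -> 16
    print(round(intercept + slope * 1))      # -> -610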