gh-115532: Minor tweaks to kde() (gh-117897)

This commit is contained in:
Raymond Hettinger 2024-04-15 10:08:21 -05:00 committed by GitHub
parent 10f1a2687a
commit 0823f43618
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 25 additions and 12 deletions

View File

@ -1163,7 +1163,7 @@ accurately approximated inverse cumulative distribution function.
.. testcode:: .. testcode::
from random import choice, random, seed from random import choice, random, seed
from math import sqrt, log, pi, tan, asin from math import sqrt, log, pi, tan, asin, cos, acos
from statistics import NormalDist from statistics import NormalDist
kernel_invcdfs = { kernel_invcdfs = {
@ -1172,6 +1172,7 @@ accurately approximated inverse cumulative distribution function.
'sigmoid': lambda p: log(tan(p * pi/2)), 'sigmoid': lambda p: log(tan(p * pi/2)),
'rectangular': lambda p: 2*p - 1, 'rectangular': lambda p: 2*p - 1,
'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p), 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p),
'parabolic': lambda p: 2 * cos((acos(2*p-1) + pi) / 3),
'cosine': lambda p: 2*asin(2*p - 1)/pi, 'cosine': lambda p: 2*asin(2*p - 1)/pi,
} }

View File

@ -919,13 +919,13 @@ def kde(data, h, kernel='normal', *, cumulative=False):
sqrt2pi = sqrt(2 * pi) sqrt2pi = sqrt(2 * pi)
sqrt2 = sqrt(2) sqrt2 = sqrt(2)
K = lambda t: exp(-1/2 * t * t) / sqrt2pi K = lambda t: exp(-1/2 * t * t) / sqrt2pi
I = lambda t: 1/2 * (1.0 + erf(t / sqrt2)) W = lambda t: 1/2 * (1.0 + erf(t / sqrt2))
support = None support = None
case 'logistic': case 'logistic':
# 1.0 / (exp(t) + 2.0 + exp(-t)) # 1.0 / (exp(t) + 2.0 + exp(-t))
K = lambda t: 1/2 / (1.0 + cosh(t)) K = lambda t: 1/2 / (1.0 + cosh(t))
I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0) W = lambda t: 1.0 - 1.0 / (exp(t) + 1.0)
support = None support = None
case 'sigmoid': case 'sigmoid':
@ -933,39 +933,39 @@ def kde(data, h, kernel='normal', *, cumulative=False):
c1 = 1 / pi c1 = 1 / pi
c2 = 2 / pi c2 = 2 / pi
K = lambda t: c1 / cosh(t) K = lambda t: c1 / cosh(t)
I = lambda t: c2 * atan(exp(t)) W = lambda t: c2 * atan(exp(t))
support = None support = None
case 'rectangular' | 'uniform': case 'rectangular' | 'uniform':
K = lambda t: 1/2 K = lambda t: 1/2
I = lambda t: 1/2 * t + 1/2 W = lambda t: 1/2 * t + 1/2
support = 1.0 support = 1.0
case 'triangular': case 'triangular':
K = lambda t: 1.0 - abs(t) K = lambda t: 1.0 - abs(t)
I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2 W = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2
support = 1.0 support = 1.0
case 'parabolic' | 'epanechnikov': case 'parabolic' | 'epanechnikov':
K = lambda t: 3/4 * (1.0 - t * t) K = lambda t: 3/4 * (1.0 - t * t)
I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2 W = lambda t: -1/4 * t**3 + 3/4 * t + 1/2
support = 1.0 support = 1.0
case 'quartic' | 'biweight': case 'quartic' | 'biweight':
K = lambda t: 15/16 * (1.0 - t * t) ** 2 K = lambda t: 15/16 * (1.0 - t * t) ** 2
I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2 W = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2
support = 1.0 support = 1.0
case 'triweight': case 'triweight':
K = lambda t: 35/32 * (1.0 - t * t) ** 3 K = lambda t: 35/32 * (1.0 - t * t) ** 3
I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2 W = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2
support = 1.0 support = 1.0
case 'cosine': case 'cosine':
c1 = pi / 4 c1 = pi / 4
c2 = pi / 2 c2 = pi / 2
K = lambda t: c1 * cos(c2 * t) K = lambda t: c1 * cos(c2 * t)
I = lambda t: 1/2 * sin(c2 * t) + 1/2 W = lambda t: 1/2 * sin(c2 * t) + 1/2
support = 1.0 support = 1.0
case _: case _:
@ -974,10 +974,14 @@ def kde(data, h, kernel='normal', *, cumulative=False):
if support is None: if support is None:
def pdf(x): def pdf(x):
n = len(data)
return sum(K((x - x_i) / h) for x_i in data) / (n * h) return sum(K((x - x_i) / h) for x_i in data) / (n * h)
def cdf(x): def cdf(x):
return sum(I((x - x_i) / h) for x_i in data) / n
n = len(data)
return sum(W((x - x_i) / h) for x_i in data) / n
else: else:
@ -985,16 +989,24 @@ def kde(data, h, kernel='normal', *, cumulative=False):
bandwidth = h * support bandwidth = h * support
def pdf(x): def pdf(x):
nonlocal n, sample
if len(data) != n:
sample = sorted(data)
n = len(data)
i = bisect_left(sample, x - bandwidth) i = bisect_left(sample, x - bandwidth)
j = bisect_right(sample, x + bandwidth) j = bisect_right(sample, x + bandwidth)
supported = sample[i : j] supported = sample[i : j]
return sum(K((x - x_i) / h) for x_i in supported) / (n * h) return sum(K((x - x_i) / h) for x_i in supported) / (n * h)
def cdf(x): def cdf(x):
nonlocal n, sample
if len(data) != n:
sample = sorted(data)
n = len(data)
i = bisect_left(sample, x - bandwidth) i = bisect_left(sample, x - bandwidth)
j = bisect_right(sample, x + bandwidth) j = bisect_right(sample, x + bandwidth)
supported = sample[i : j] supported = sample[i : j]
return sum((I((x - x_i) / h) for x_i in supported), i) / n return sum((W((x - x_i) / h) for x_i in supported), i) / n
if cumulative: if cumulative:
cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}' cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'