Skip to content

Commit 46279be

Browse files
mzeitlin11feefladder
authored andcommitted
PERF: nancorr pearson (pandas-dev#42761)
1 parent 0b35035 commit 46279be

File tree

2 files changed

+51
-13
lines changed

2 files changed

+51
-13
lines changed

doc/source/whatsnew/v1.4.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ Performance improvements
171171
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
172172
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
173173
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
174+
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
175+
-
174176

175177
.. ---------------------------------------------------------------------------
176178

pandas/_libs/algos.pyx

+49-13
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,12 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
326326
Py_ssize_t i, j, xi, yi, N, K
327327
bint minpv
328328
float64_t[:, ::1] result
329+
# Initialize to None since we only use in the no missing value case
330+
float64_t[::1] means=None, ssqds=None
329331
ndarray[uint8_t, ndim=2] mask
332+
bint no_nans
330333
int64_t nobs = 0
334+
float64_t mean, ssqd, val
331335
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
332336

333337
N, K = (<object>mat).shape
@@ -339,25 +343,57 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
339343

340344
result = np.empty((K, K), dtype=np.float64)
341345
mask = np.isfinite(mat).view(np.uint8)
346+
no_nans = mask.all()
347+
348+
# Computing the online means and variances is expensive - so if possible we can
349+
# precompute these and avoid repeating the computations each time we handle
350+
# an (xi, yi) pair
351+
if no_nans:
352+
means = np.empty(K, dtype=np.float64)
353+
ssqds = np.empty(K, dtype=np.float64)
354+
355+
with nogil:
356+
for j in range(K):
357+
ssqd = mean = 0
358+
for i in range(N):
359+
val = mat[i, j]
360+
dx = val - mean
361+
mean += 1 / (i + 1) * dx
362+
ssqd += (val - mean) * dx
363+
364+
means[j] = mean
365+
ssqds[j] = ssqd
342366

343367
with nogil:
344368
for xi in range(K):
345369
for yi in range(xi + 1):
346-
# Welford's method for the variance-calculation
347-
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
348-
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
349-
for i in range(N):
350-
if mask[i, xi] and mask[i, yi]:
370+
covxy = 0
371+
if no_nans:
372+
for i in range(N):
351373
vx = mat[i, xi]
352374
vy = mat[i, yi]
353-
nobs += 1
354-
dx = vx - meanx
355-
dy = vy - meany
356-
meanx += 1 / nobs * dx
357-
meany += 1 / nobs * dy
358-
ssqdmx += (vx - meanx) * dx
359-
ssqdmy += (vy - meany) * dy
360-
covxy += (vx - meanx) * dy
375+
covxy += (vx - means[xi]) * (vy - means[yi])
376+
377+
ssqdmx = ssqds[xi]
378+
ssqdmy = ssqds[yi]
379+
nobs = N
380+
381+
else:
382+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
383+
for i in range(N):
384+
# Welford's method for the variance-calculation
385+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
386+
if mask[i, xi] and mask[i, yi]:
387+
vx = mat[i, xi]
388+
vy = mat[i, yi]
389+
nobs += 1
390+
dx = vx - meanx
391+
dy = vy - meany
392+
meanx += 1 / nobs * dx
393+
meany += 1 / nobs * dy
394+
ssqdmx += (vx - meanx) * dx
395+
ssqdmy += (vy - meany) * dy
396+
covxy += (vx - meanx) * dy
361397

362398
if nobs < minpv:
363399
result[xi, yi] = result[yi, xi] = NaN

0 commit comments

Comments
 (0)