Skip to content

Commit b33c630

Browse files
authored
Revert "PERF: nancorr pearson (#42761)" (#45646)
1 parent 66381d3 commit b33c630

File tree

3 files changed

+21
-50
lines changed

3 files changed

+21
-50
lines changed

doc/source/whatsnew/v1.4.1.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ Bug fixes
3232

3333
Other
3434
~~~~~
35-
-
35+
- Reverted performance speedup of :meth:`DataFrame.corr` for ``method=pearson`` to fix precision regression (:issue:`45640`, :issue:`42761`)
3636
-
3737

3838
.. ---------------------------------------------------------------------------

pandas/_libs/algos.pyx

+13-49
Original file line numberDiff line numberDiff line change
@@ -329,12 +329,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
329329
Py_ssize_t i, j, xi, yi, N, K
330330
bint minpv
331331
float64_t[:, ::1] result
332-
# Initialize to None since we only use in the no missing value case
333-
float64_t[::1] means=None, ssqds=None
334332
ndarray[uint8_t, ndim=2] mask
335-
bint no_nans
336333
int64_t nobs = 0
337-
float64_t mean, ssqd, val
338334
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
339335

340336
N, K = (<object>mat).shape
@@ -346,57 +342,25 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
346342

347343
result = np.empty((K, K), dtype=np.float64)
348344
mask = np.isfinite(mat).view(np.uint8)
349-
no_nans = mask.all()
350-
351-
# Computing the online means and variances is expensive - so if possible we can
352-
# precompute these and avoid repeating the computations each time we handle
353-
# an (xi, yi) pair
354-
if no_nans:
355-
means = np.empty(K, dtype=np.float64)
356-
ssqds = np.empty(K, dtype=np.float64)
357-
358-
with nogil:
359-
for j in range(K):
360-
ssqd = mean = 0
361-
for i in range(N):
362-
val = mat[i, j]
363-
dx = val - mean
364-
mean += 1 / (i + 1) * dx
365-
ssqd += (val - mean) * dx
366-
367-
means[j] = mean
368-
ssqds[j] = ssqd
369345

370346
with nogil:
371347
for xi in range(K):
372348
for yi in range(xi + 1):
373-
covxy = 0
374-
if no_nans:
375-
for i in range(N):
349+
# Welford's method for the variance-calculation
350+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
351+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
352+
for i in range(N):
353+
if mask[i, xi] and mask[i, yi]:
376354
vx = mat[i, xi]
377355
vy = mat[i, yi]
378-
covxy += (vx - means[xi]) * (vy - means[yi])
379-
380-
ssqdmx = ssqds[xi]
381-
ssqdmy = ssqds[yi]
382-
nobs = N
383-
384-
else:
385-
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
386-
for i in range(N):
387-
# Welford's method for the variance-calculation
388-
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
389-
if mask[i, xi] and mask[i, yi]:
390-
vx = mat[i, xi]
391-
vy = mat[i, yi]
392-
nobs += 1
393-
dx = vx - meanx
394-
dy = vy - meany
395-
meanx += 1 / nobs * dx
396-
meany += 1 / nobs * dy
397-
ssqdmx += (vx - meanx) * dx
398-
ssqdmy += (vy - meany) * dy
399-
covxy += (vx - meanx) * dy
356+
nobs += 1
357+
dx = vx - meanx
358+
dy = vy - meany
359+
meanx += 1 / nobs * dx
360+
meany += 1 / nobs * dy
361+
ssqdmx += (vx - meanx) * dx
362+
ssqdmy += (vy - meany) * dy
363+
covxy += (vx - meanx) * dy
400364

401365
if nobs < minpv:
402366
result[xi, yi] = result[yi, xi] = NaN

pandas/tests/frame/methods/test_cov_corr.py

+7
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,13 @@ def test_corrwith_dup_cols(self):
337337
expected = Series(np.ones(4), index=[0, 0, 1, 2])
338338
tm.assert_series_equal(result, expected)
339339

340+
def test_corr_numerical_instabilities(self):
341+
# GH#45640
342+
df = DataFrame([[0.2, 0.4], [0.4, 0.2]])
343+
result = df.corr()
344+
expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
345+
tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)
346+
340347
@td.skip_if_no_scipy
341348
def test_corrwith_spearman(self):
342349
# GH#21925

0 commit comments

Comments
 (0)