From c56e77a4efb6e41f3b5147cdec63673765f4fb7a Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 26 Jan 2022 21:43:59 +0100 Subject: [PATCH 1/4] Revert "PERF: nancorr pearson (#42761)" This reverts commit 13560bbc --- pandas/_libs/algos.pyx | 62 +++++++++--------------------------------- 1 file changed, 13 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3d099a53163bc..1b33b90ba0fa5 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -329,12 +329,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): Py_ssize_t i, j, xi, yi, N, K bint minpv float64_t[:, ::1] result - # Initialize to None since we only use in the no missing value case - float64_t[::1] means=None, ssqds=None ndarray[uint8_t, ndim=2] mask - bint no_nans int64_t nobs = 0 - float64_t mean, ssqd, val float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy N, K = (mat).shape @@ -346,57 +342,25 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) - no_nans = mask.all() - - # Computing the online means and variances is expensive - so if possible we can - # precompute these and avoid repeating the computations each time we handle - # an (xi, yi) pair - if no_nans: - means = np.empty(K, dtype=np.float64) - ssqds = np.empty(K, dtype=np.float64) - - with nogil: - for j in range(K): - ssqd = mean = 0 - for i in range(N): - val = mat[i, j] - dx = val - mean - mean += 1 / (i + 1) * dx - ssqd += (val - mean) * dx - - means[j] = mean - ssqds[j] = ssqd with nogil: for xi in range(K): for yi in range(xi + 1): - covxy = 0 - if no_nans: - for i in range(N): + # Welford's method for the variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: vx = mat[i, xi] vy = mat[i, yi] - covxy += (vx - means[xi]) * (vy - means[yi]) - - ssqdmx = ssqds[xi] - ssqdmy = ssqds[yi] - nobs = N - - else: - nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 - for i in range(N): - # Welford's method for the variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - if mask[i, xi] and mask[i, yi]: - vx = mat[i, xi] - vy = mat[i, yi] - nobs += 1 - dx = vx - meanx - dy = vy - meany - meanx += 1 / nobs * dx - meany += 1 / nobs * dy - ssqdmx += (vx - meanx) * dx - ssqdmy += (vy - meany) * dy - covxy += (vx - meanx) * dy + nobs += 1 + dx = vx - meanx + dy = vy - meany + meanx += 1 / nobs * dx + meany += 1 / nobs * dy + ssqdmx += (vx - meanx) * dx + ssqdmy += (vy - meany) * dy + covxy += (vx - meanx) * dy if nobs < minpv: result[xi, yi] = result[yi, xi] = NaN From 54e424969fe46d23a14e88fd87c04fdeeb602918 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 26 Jan 2022 21:45:09 +0100 Subject: [PATCH 2/4] Add note --- doc/source/whatsnew/v1.4.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.1.rst b/doc/source/whatsnew/v1.4.1.rst index 79dae514b77e9..be5fe918f7d17 100644 --- a/doc/source/whatsnew/v1.4.1.rst +++ b/doc/source/whatsnew/v1.4.1.rst @@ -32,7 +32,7 @@ Bug fixes Other ~~~~~ -- +- Reverted performance speedup of nancorr improvement (:issue:`45640`, :issue:`42761`) - .. --------------------------------------------------------------------------- From 0f854fc79e66c6f516ab7497150db1820fcb1751 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 26 Jan 2022 21:55:24 +0100 Subject: [PATCH 3/4] Add test --- pandas/tests/frame/methods/test_cov_corr.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 60d5d8c8ccaca..6a1466ae1ea46 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -337,6 +337,13 @@ def test_corrwith_dup_cols(self): expected = Series(np.ones(4), index=[0, 0, 1, 2]) tm.assert_series_equal(result, expected) + def test_corr_numerical_instabilities(self): + # GH#45640 + df = DataFrame([[0.2, 0.4], [0.4, 0.2]]) + result = df.corr() + expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]}) + tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17) + @td.skip_if_no_scipy def test_corrwith_spearman(self): # GH#21925 From db6e0b1454c0d4693b2c25b8b91a1c12332f4da2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 26 Jan 2022 22:04:15 +0100 Subject: [PATCH 4/4] Update doc/source/whatsnew/v1.4.1.rst Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> --- doc/source/whatsnew/v1.4.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.1.rst b/doc/source/whatsnew/v1.4.1.rst index be5fe918f7d17..1b69c42e90e95 100644 --- a/doc/source/whatsnew/v1.4.1.rst +++ b/doc/source/whatsnew/v1.4.1.rst @@ -32,7 +32,7 @@ Bug fixes Other ~~~~~ -- Reverted performance speedup of nancorr improvement (:issue:`45640`, :issue:`42761`) +- Reverted performance speedup of :meth:`DataFrame.corr` for ``method=pearson`` to fix precision regression (:issue:`45640`, :issue:`42761`) - .. ---------------------------------------------------------------------------