Skip to content

Commit 0c4012d

Browse files
phoflukarroum
authored andcommitted
ENH: Improve numerical stability for Pearson corr() and cov() (pandas-dev#37453)
1 parent b362d5f commit 0c4012d

File tree

3 files changed

+34
-21
lines changed

3 files changed

+34
-21
lines changed

doc/source/whatsnew/v1.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ Other enhancements
221221
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
222222
- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
223223
- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
224+
- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`)
224225
- :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`)
225226
- :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`)
226227
- :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`)

pandas/_libs/algos.pyx

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
268268
ndarray[float64_t, ndim=2] result
269269
ndarray[uint8_t, ndim=2] mask
270270
int64_t nobs = 0
271-
float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor
271+
float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx
272+
float64_t ssqdmy, covxy
272273

273274
N, K = (<object>mat).shape
274275

@@ -283,37 +284,29 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
283284
with nogil:
284285
for xi in range(K):
285286
for yi in range(xi + 1):
286-
nobs = sumxx = sumyy = sumx = sumy = 0
287+
# Welford's method for the variance-calculation
288+
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
289+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
287290
for i in range(N):
288291
if mask[i, xi] and mask[i, yi]:
289292
vx = mat[i, xi]
290293
vy = mat[i, yi]
291294
nobs += 1
292-
sumx += vx
293-
sumy += vy
295+
prev_meanx = meanx
296+
prev_meany = meany
297+
meanx = meanx + 1 / nobs * (vx - meanx)
298+
meany = meany + 1 / nobs * (vy - meany)
299+
ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
300+
ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
301+
covxy = covxy + (vx - meanx) * (vy - prev_meany)
294302

295303
if nobs < minpv:
296304
result[xi, yi] = result[yi, xi] = NaN
297305
else:
298-
meanx = sumx / nobs
299-
meany = sumy / nobs
300-
301-
# now the cov numerator
302-
sumx = 0
303-
304-
for i in range(N):
305-
if mask[i, xi] and mask[i, yi]:
306-
vx = mat[i, xi] - meanx
307-
vy = mat[i, yi] - meany
308-
309-
sumx += vx * vy
310-
sumxx += vx * vx
311-
sumyy += vy * vy
312-
313-
divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
306+
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
314307

315308
if divisor != 0:
316-
result[xi, yi] = result[yi, xi] = sumx / divisor
309+
result[xi, yi] = result[yi, xi] = covxy / divisor
317310
else:
318311
result[xi, yi] = result[yi, xi] = NaN
319312

pandas/tests/frame/methods/test_cov_corr.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,25 @@ def test_corr_item_cache(self):
208208
assert df["A"] is ser
209209
assert df.values[0, 0] == 99
210210

211+
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
212+
def test_corr_for_constant_columns(self, length):
213+
# GH: 37448
214+
df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
215+
result = df.corr()
216+
expected = DataFrame(
217+
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
218+
)
219+
tm.assert_frame_equal(result, expected)
220+
221+
def test_calc_corr_small_numbers(self):
222+
# GH: 37452
223+
df = DataFrame(
224+
{"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]}
225+
)
226+
result = df.corr()
227+
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
228+
tm.assert_frame_equal(result, expected)
229+
211230

212231
class TestDataFrameCorrWith:
213232
def test_corrwith(self, datetime_frame):

0 commit comments

Comments
 (0)