Skip to content

Commit 168d78d

Browse files
committed
Use Welford's algorithm to calculate corr
1 parent daaaabd commit 168d78d

File tree

2 files changed

+23
-22
lines changed

2 files changed

+23
-22
lines changed

pandas/_libs/algos.pyx

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
268268
ndarray[float64_t, ndim=2] result
269269
ndarray[uint8_t, ndim=2] mask
270270
int64_t nobs = 0
271-
float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor
271+
float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx, ssqdmy, covxy
272272

273273
N, K = (<object>mat).shape
274274

@@ -283,37 +283,28 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
283283
with nogil:
284284
for xi in range(K):
285285
for yi in range(xi + 1):
286-
nobs = sumxx = sumyy = sumx = sumy = 0
286+
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
287287
for i in range(N):
288288
if mask[i, xi] and mask[i, yi]:
289289
vx = mat[i, xi]
290290
vy = mat[i, yi]
291291
nobs += 1
292-
sumx += vx
293-
sumy += vy
292+
prev_meanx = meanx
293+
prev_meany = meany
294+
meanx = meanx + 1 / nobs * (vx - meanx)
295+
meany = meany + 1 / nobs * (vy - meany)
296+
ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
297+
ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
298+
covxy = covxy + (vx - meanx) * (vy - prev_meany)
294299

295300
if nobs < minpv:
296301
result[xi, yi] = result[yi, xi] = NaN
297302
else:
298-
meanx = sumx / nobs
299-
meany = sumy / nobs
300-
301-
# now the cov numerator
302-
sumx = 0
303-
for i in range(N):
304-
if mask[i, xi] and mask[i, yi]:
305-
vx = mat[i, xi] - meanx
306-
vy = mat[i, yi] - meany
307-
308-
sumx += vx * vy
309-
sumxx += vx * vx
310-
sumyy += vy * vy
311-
312-
divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
303+
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
313304

314305
# numerical issues for constant columns
315-
if divisor > 1e-15:
316-
result[xi, yi] = result[yi, xi] = sumx / divisor
306+
if divisor != 0:
307+
result[xi, yi] = result[yi, xi] = covxy / divisor
317308
else:
318309
result[xi, yi] = result[yi, xi] = NaN
319310

@@ -323,6 +314,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
323314
# Pairwise Spearman correlation
324315

325316

317+
326318
@cython.boundscheck(False)
327319
@cython.wraparound(False)
328320
def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1) -> ndarray:

pandas/tests/frame/methods/test_cov_corr.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def test_corr_item_cache(self):
208208
assert df["A"] is ser
209209
assert df.values[0, 0] == 99
210210

211-
@pytest.mark.parametrize("length", [2, 20, 200, 2000, 20000])
211+
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
212212
def test_corr_for_constant_columns(self, length):
213213
# GH: 37448
214214
df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
@@ -218,6 +218,15 @@ def test_corr_for_constant_columns(self, length):
218218
)
219219
tm.assert_frame_equal(result, expected)
220220

221+
def test_calc_corr_small_numbers(self):
222+
# GH: 37452
223+
df = DataFrame(
224+
{"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]}
225+
)
226+
result = df.corr()
227+
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
228+
tm.assert_frame_equal(result, expected)
229+
221230

222231
class TestDataFrameCorrWith:
223232
def test_corrwith(self, datetime_frame):

0 commit comments

Comments
 (0)