Skip to content

Commit 0c086fe

Browse files
mzeitlin11 authored and JulianWgs committed
PERF: nancorr_spearman fastpath (pandas-dev#41885)
1 parent f3263b0 commit 0c086fe

File tree

3 files changed

+70
-37
lines changed

3 files changed

+70
-37
lines changed

doc/source/whatsnew/v1.3.0.rst

+1 -1
Original file line numberDiff line numberDiff line change
@@ -842,7 +842,7 @@ Performance improvements
842842
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
843843
- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad"`` or ``method="backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
844844
- Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`)
845-
- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`)
845+
- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`, :issue:`41885`)
846846
- Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`)
847847
- Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.RollingGroupby.cov`, :meth:`.ExpandingGroupby.corr` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`)
848848
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)

pandas/_libs/algos.pyx

+59 -36
Original file line numberDiff line numberDiff line change
@@ -387,15 +387,23 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
387387
float64_t[::1] maskedx, maskedy
388388
ndarray[uint8_t, ndim=2] mask
389389
int64_t nobs = 0
390+
bint no_nans
390391
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
391392
const int64_t[:] labels_n, labels_nobs
392393

393394
N, K = (<object>mat).shape
394395
# For compatibility when calling rank_1d
395396
labels_n = np.zeros(N, dtype=np.int64)
396397

398+
# Handle the edge case where we know all results will be nan
399+
# to keep conditional logic inside loop simpler
400+
if N < minp:
401+
result = np.full((K, K), np.nan, dtype=np.float64)
402+
return result
403+
397404
result = np.empty((K, K), dtype=np.float64)
398405
mask = np.isfinite(mat).view(np.uint8)
406+
no_nans = mask.all()
399407

400408
ranked_mat = np.empty((N, K), dtype=np.float64)
401409

@@ -409,51 +417,66 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
409417
with nogil:
410418
for xi in range(K):
411419
for yi in range(xi + 1):
412-
nobs = 0
413-
# Keep track of whether we need to recompute ranks
414-
all_ranks = True
415-
for i in range(N):
416-
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
417-
if mask[i, xi] and mask[i, yi]:
418-
maskedx[nobs] = ranked_mat[i, xi]
419-
maskedy[nobs] = ranked_mat[i, yi]
420-
nobs += 1
421-
422-
if nobs < minp:
423-
result[xi, yi] = result[yi, xi] = NaN
424-
else:
425-
if not all_ranks:
426-
with gil:
427-
# We need to slice back to nobs because rank_1d will
428-
# require arrays of nobs length
429-
labels_nobs = np.zeros(nobs, dtype=np.int64)
430-
rankedx = rank_1d(np.array(maskedx)[:nobs],
431-
labels=labels_nobs)
432-
rankedy = rank_1d(np.array(maskedy)[:nobs],
433-
labels=labels_nobs)
434-
for i in range(nobs):
435-
maskedx[i] = rankedx[i]
436-
maskedy[i] = rankedy[i]
420+
sumx = sumxx = sumyy = 0
437421

438-
mean = (nobs + 1) / 2.
422+
# Fastpath for data with no nans/infs, allows avoiding mask checks
423+
# and array reassignments
424+
if no_nans:
425+
mean = (N + 1) / 2.
439426

440427
# now the cov numerator
441-
sumx = sumxx = sumyy = 0
442-
443-
for i in range(nobs):
444-
vx = maskedx[i] - mean
445-
vy = maskedy[i] - mean
428+
for i in range(N):
429+
vx = ranked_mat[i, xi] - mean
430+
vy = ranked_mat[i, yi] - mean
446431

447432
sumx += vx * vy
448433
sumxx += vx * vx
449434
sumyy += vy * vy
435+
else:
436+
nobs = 0
437+
# Keep track of whether we need to recompute ranks
438+
all_ranks = True
439+
for i in range(N):
440+
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
441+
if mask[i, xi] and mask[i, yi]:
442+
maskedx[nobs] = ranked_mat[i, xi]
443+
maskedy[nobs] = ranked_mat[i, yi]
444+
nobs += 1
445+
446+
if nobs < minp:
447+
result[xi, yi] = result[yi, xi] = NaN
448+
continue
449+
else:
450+
if not all_ranks:
451+
with gil:
452+
# We need to slice back to nobs because rank_1d will
453+
# require arrays of nobs length
454+
labels_nobs = np.zeros(nobs, dtype=np.int64)
455+
rankedx = rank_1d(np.array(maskedx)[:nobs],
456+
labels=labels_nobs)
457+
rankedy = rank_1d(np.array(maskedy)[:nobs],
458+
labels=labels_nobs)
459+
for i in range(nobs):
460+
maskedx[i] = rankedx[i]
461+
maskedy[i] = rankedy[i]
462+
463+
mean = (nobs + 1) / 2.
464+
465+
# now the cov numerator
466+
for i in range(nobs):
467+
vx = maskedx[i] - mean
468+
vy = maskedy[i] - mean
450469

451-
divisor = sqrt(sumxx * sumyy)
470+
sumx += vx * vy
471+
sumxx += vx * vx
472+
sumyy += vy * vy
452473

453-
if divisor != 0:
454-
result[xi, yi] = result[yi, xi] = sumx / divisor
455-
else:
456-
result[xi, yi] = result[yi, xi] = NaN
474+
divisor = sqrt(sumxx * sumyy)
475+
476+
if divisor != 0:
477+
result[xi, yi] = result[yi, xi] = sumx / divisor
478+
else:
479+
result[xi, yi] = result[yi, xi] = NaN
457480

458481
return result
459482

pandas/tests/frame/methods/test_cov_corr.py

+10
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,16 @@ def test_calc_corr_small_numbers(self):
232232
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
233233
tm.assert_frame_equal(result, expected)
234234

235+
@td.skip_if_no_scipy
236+
@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
237+
def test_corr_min_periods_greater_than_length(self, method):
238+
df = DataFrame({"A": [1, 2], "B": [1, 2]})
239+
result = df.corr(method=method, min_periods=3)
240+
expected = DataFrame(
241+
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
242+
)
243+
tm.assert_frame_equal(result, expected)
244+
235245

236246
class TestDataFrameCorrWith:
237247
def test_corrwith(self, datetime_frame):

0 commit comments

Comments
 (0)