Skip to content

Commit 6485e48

Browse files
mzeitlin11 authored and JulianWgs committed
PERF: nancorr_spearman (pandas-dev#41857)
1 parent 9ad2fbb commit 6485e48

File tree

2 files changed

+51
-45
lines changed

2 files changed

+51
-45
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -859,6 +859,7 @@ Performance improvements
859859
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
860860
- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
861861
- Performance improvement in :meth:`DataFrame.corr` for ``method="kendall"`` (:issue:`28329`)
862+
- Performance improvement in :meth:`DataFrame.corr` for ``method="spearman"`` (:issue:`40956`)
862863
- Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`)
863864
- Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.RollingGroupby.cov` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`)
864865
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)

pandas/_libs/algos.pyx

+50-45
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
383383
Py_ssize_t i, j, xi, yi, N, K
384384
ndarray[float64_t, ndim=2] result
385385
ndarray[float64_t, ndim=2] ranked_mat
386-
ndarray[float64_t, ndim=1] maskedx
387-
ndarray[float64_t, ndim=1] maskedy
386+
ndarray[float64_t, ndim=1] rankedx, rankedy
387+
float64_t[::1] maskedx, maskedy
388388
ndarray[uint8_t, ndim=2] mask
389389
int64_t nobs = 0
390390
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
@@ -399,56 +399,61 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
399399

400400
ranked_mat = np.empty((N, K), dtype=np.float64)
401401

402+
# Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe
403+
# here since N >= nobs and values are stored contiguously
404+
maskedx = np.empty(N, dtype=np.float64)
405+
maskedy = np.empty(N, dtype=np.float64)
402406
for i in range(K):
403407
ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n)
404408

405-
for xi in range(K):
406-
for yi in range(xi + 1):
407-
nobs = 0
408-
# Keep track of whether we need to recompute ranks
409-
all_ranks = True
410-
for i in range(N):
411-
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
412-
if mask[i, xi] and mask[i, yi]:
413-
nobs += 1
414-
415-
if nobs < minp:
416-
result[xi, yi] = result[yi, xi] = NaN
417-
else:
418-
maskedx = np.empty(nobs, dtype=np.float64)
419-
maskedy = np.empty(nobs, dtype=np.float64)
420-
j = 0
421-
409+
with nogil:
410+
for xi in range(K):
411+
for yi in range(xi + 1):
412+
nobs = 0
413+
# Keep track of whether we need to recompute ranks
414+
all_ranks = True
422415
for i in range(N):
416+
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
423417
if mask[i, xi] and mask[i, yi]:
424-
maskedx[j] = ranked_mat[i, xi]
425-
maskedy[j] = ranked_mat[i, yi]
426-
j += 1
427-
428-
if not all_ranks:
429-
labels_nobs = np.zeros(nobs, dtype=np.int64)
430-
maskedx = rank_1d(maskedx, labels=labels_nobs)
431-
maskedy = rank_1d(maskedy, labels=labels_nobs)
432-
433-
mean = (nobs + 1) / 2.
434-
435-
# now the cov numerator
436-
sumx = sumxx = sumyy = 0
437-
438-
for i in range(nobs):
439-
vx = maskedx[i] - mean
440-
vy = maskedy[i] - mean
441-
442-
sumx += vx * vy
443-
sumxx += vx * vx
444-
sumyy += vy * vy
445-
446-
divisor = sqrt(sumxx * sumyy)
418+
maskedx[nobs] = ranked_mat[i, xi]
419+
maskedy[nobs] = ranked_mat[i, yi]
420+
nobs += 1
447421

448-
if divisor != 0:
449-
result[xi, yi] = result[yi, xi] = sumx / divisor
450-
else:
422+
if nobs < minp:
451423
result[xi, yi] = result[yi, xi] = NaN
424+
else:
425+
if not all_ranks:
426+
with gil:
427+
# We need to slice back to nobs because rank_1d will
428+
# require arrays of nobs length
429+
labels_nobs = np.zeros(nobs, dtype=np.int64)
430+
rankedx = rank_1d(np.array(maskedx)[:nobs],
431+
labels=labels_nobs)
432+
rankedy = rank_1d(np.array(maskedy)[:nobs],
433+
labels=labels_nobs)
434+
for i in range(nobs):
435+
maskedx[i] = rankedx[i]
436+
maskedy[i] = rankedy[i]
437+
438+
mean = (nobs + 1) / 2.
439+
440+
# now the cov numerator
441+
sumx = sumxx = sumyy = 0
442+
443+
for i in range(nobs):
444+
vx = maskedx[i] - mean
445+
vy = maskedy[i] - mean
446+
447+
sumx += vx * vy
448+
sumxx += vx * vx
449+
sumyy += vy * vy
450+
451+
divisor = sqrt(sumxx * sumyy)
452+
453+
if divisor != 0:
454+
result[xi, yi] = result[yi, xi] = sumx / divisor
455+
else:
456+
result[xi, yi] = result[yi, xi] = NaN
452457

453458
return result
454459

0 commit comments

Comments
 (0)