Skip to content

Commit 4efa898

Browse files
dsaxton authored and proost committed
PERF: Speed up Spearman calculation (pandas-dev#28151)
1 parent d3abd43 commit 4efa898

File tree

3 files changed

+28
-5
lines changed

3 files changed

+28
-5
lines changed

asv_bench/benchmarks/stat_ops.py

+11
Original file line number | Diff line number | Diff line change
@@ -113,12 +113,23 @@ def setup(self, method, use_bottleneck):
113113
nanops._USE_BOTTLENECK = use_bottleneck
114114
self.df = pd.DataFrame(np.random.randn(1000, 30))
115115
self.df2 = pd.DataFrame(np.random.randn(1000, 30))
116+
self.df_wide = pd.DataFrame(np.random.randn(1000, 200))
117+
self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9)
116118
self.s = pd.Series(np.random.randn(1000))
117119
self.s2 = pd.Series(np.random.randn(1000))
118120

119121
def time_corr(self, method, use_bottleneck):
120122
self.df.corr(method=method)
121123

124+
def time_corr_wide(self, method, use_bottleneck):
125+
self.df_wide.corr(method=method)
126+
127+
def time_corr_wide_nans(self, method, use_bottleneck):
128+
self.df_wide_nans.corr(method=method)
129+
130+
def peakmem_corr_wide(self, method, use_bottleneck):
131+
self.df_wide.corr(method=method)
132+
122133
def time_corr_series(self, method, use_bottleneck):
123134
self.s.corr(self.s2, method=method)
124135

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@ Performance improvements
7575
- Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
7676
- Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`)
7777
- Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
78+
- Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`)
7879
- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`)
7980

80-
8181
.. _whatsnew_1000.bug_fixes:
8282

8383
Bug fixes

pandas/_libs/algos.pyx

+16-4
Original file line number | Diff line number | Diff line change
@@ -296,6 +296,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
296296
cdef:
297297
Py_ssize_t i, j, xi, yi, N, K
298298
ndarray[float64_t, ndim=2] result
299+
ndarray[float64_t, ndim=2] ranked_mat
299300
ndarray[float64_t, ndim=1] maskedx
300301
ndarray[float64_t, ndim=1] maskedy
301302
ndarray[uint8_t, ndim=2] mask
@@ -307,10 +308,18 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
307308
result = np.empty((K, K), dtype=np.float64)
308309
mask = np.isfinite(mat).view(np.uint8)
309310

311+
ranked_mat = np.empty((N, K), dtype=np.float64)
312+
313+
for i in range(K):
314+
ranked_mat[:, i] = rank_1d_float64(mat[:, i])
315+
310316
for xi in range(K):
311317
for yi in range(xi + 1):
312318
nobs = 0
319+
# Keep track of whether we need to recompute ranks
320+
all_ranks = True
313321
for i in range(N):
322+
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
314323
if mask[i, xi] and mask[i, yi]:
315324
nobs += 1
316325

@@ -320,13 +329,16 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
320329
maskedx = np.empty(nobs, dtype=np.float64)
321330
maskedy = np.empty(nobs, dtype=np.float64)
322331
j = 0
332+
323333
for i in range(N):
324334
if mask[i, xi] and mask[i, yi]:
325-
maskedx[j] = mat[i, xi]
326-
maskedy[j] = mat[i, yi]
335+
maskedx[j] = ranked_mat[i, xi]
336+
maskedy[j] = ranked_mat[i, yi]
327337
j += 1
328-
maskedx = rank_1d_float64(maskedx)
329-
maskedy = rank_1d_float64(maskedy)
338+
339+
if not all_ranks:
340+
maskedx = rank_1d_float64(maskedx)
341+
maskedy = rank_1d_float64(maskedy)
330342

331343
mean = (nobs + 1) / 2.
332344

0 commit comments

Comments
 (0)