# Patch summary (free text ahead of the first "diff --git" header; not part of any hunk).
#
# asv_bench/benchmarks/stat_ops.py
#   - setup() gains two fixtures: df_wide, a 1000 x 200 frame of np.random.randn
#     values, and df_wide_nans, which is df_wide passed through DataFrame.where
#     with the condition `np.random.random((1000, 200)) < 0.9` (per pandas'
#     `where`, entries failing the condition are replaced, NaN by default).
#   - New benchmarks time_corr_wide, time_corr_wide_nans and peakmem_corr_wide
#     call DataFrame.corr(method=method) on those wide fixtures.
#
# doc/source/whatsnew/v1.0.0.rst
#   - Adds a "Performance improvements" entry for DataFrame.corr with
#     method "spearman" (issue 28139) and drops one blank line before the
#     bug_fixes label.
#
# pandas/_libs/algos.pyx (nancorr_spearman)
#   - Each column is ranked once up front into ranked_mat via rank_1d_float64.
#   - For every column pair, the pre-computed ranks are copied for the rows that
#     are finite in both columns; rank_1d_float64 is re-run on that subset only
#     when the two columns' finite masks differ on at least one row
#     (all_ranks is False).
#   - NOTE(review): mask comes from np.isfinite, so +/-inf rows are excluded from
#     the pair but still take part in the column-wide pre-ranking. When both
#     columns are non-finite on exactly the same rows the pre-computed ranks are
#     reused as-is; confirm that rank_1d_float64's treatment of inf cannot
#     offset those ranks relative to the (nobs + 1) / 2 mean used afterwards.
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 6032bee41958e..ed5ebfa61594e 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -113,12 +113,23 @@ def setup(self, method, use_bottleneck): nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) + self.df_wide = pd.DataFrame(np.random.randn(1000, 200)) + self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) def time_corr(self, method, use_bottleneck): self.df.corr(method=method) + def time_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + + def time_corr_wide_nans(self, method, use_bottleneck): + self.df_wide_nans.corr(method=method) + + def peakmem_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + def time_corr_series(self, method, use_bottleneck): self.s.corr(self.s2, method=method) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0d2b81eca6789..61e877e3d44ba 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -75,9 +75,9 @@ Performance improvements - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) +- Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`) - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) - .. 
_whatsnew_1000.bug_fixes: Bug fixes diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 038447ad252fe..0f91f612994c7 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -296,6 +296,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result + ndarray[float64_t, ndim=2] ranked_mat ndarray[float64_t, ndim=1] maskedx ndarray[float64_t, ndim=1] maskedy ndarray[uint8_t, ndim=2] mask @@ -307,10 +308,18 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + ranked_mat = np.empty((N, K), dtype=np.float64) + + for i in range(K): + ranked_mat[:, i] = rank_1d_float64(mat[:, i]) + for xi in range(K): for yi in range(xi + 1): nobs = 0 + # Keep track of whether we need to recompute ranks + all_ranks = True for i in range(N): + all_ranks &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: nobs += 1 @@ -320,13 +329,16 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): maskedx = np.empty(nobs, dtype=np.float64) maskedy = np.empty(nobs, dtype=np.float64) j = 0 + for i in range(N): if mask[i, xi] and mask[i, yi]: - maskedx[j] = mat[i, xi] - maskedy[j] = mat[i, yi] + maskedx[j] = ranked_mat[i, xi] + maskedy[j] = ranked_mat[i, yi] j += 1 - maskedx = rank_1d_float64(maskedx) - maskedy = rank_1d_float64(maskedy) + + if not all_ranks: + maskedx = rank_1d_float64(maskedx) + maskedy = rank_1d_float64(maskedy) mean = (nobs + 1) / 2.