From 9f8e3cffaad603b7cb4e72f0dec949e49a863ac5 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Sun, 25 Aug 2019 19:42:19 -0500 Subject: [PATCH 01/15] Rank outside loop --- pandas/_libs/algos.pyx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 038447ad252fe..e288e845dfb34 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -296,6 +296,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result + ndarray[float64_t, ndim=2] ranked_mat ndarray[float64_t, ndim=1] maskedx ndarray[float64_t, ndim=1] maskedy ndarray[uint8_t, ndim=2] mask @@ -307,6 +308,11 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + ranked_mat = np.empty((N, K), dtype=np.float64) + + for i in range(K): + ranked_mat[:, i] = rank_1d_float64(mat[:, i]) + for xi in range(K): for yi in range(xi + 1): nobs = 0 @@ -322,11 +328,9 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): j = 0 for i in range(N): if mask[i, xi] and mask[i, yi]: - maskedx[j] = mat[i, xi] - maskedy[j] = mat[i, yi] + maskedx[j] = ranked_mat[i, xi] + maskedy[j] = ranked_mat[i, yi] j += 1 - maskedx = rank_1d_float64(maskedx) - maskedy = rank_1d_float64(maskedy) mean = (nobs + 1) / 2. From 08d8e394c30a83cd35a05c1f41f8c4ad94addd0e Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Mon, 26 Aug 2019 08:38:52 -0500 Subject: [PATCH 02/15] Check for same missing pattern --- pandas/_libs/algos.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index e288e845dfb34..af5a7d1666420 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -316,7 +316,9 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): for xi in range(K): for yi in range(xi + 1): nobs = 0 + same_miss_pat = True for i in range(N): + same_miss_pat &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: nobs += 1 @@ -326,12 +328,17 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): maskedx = np.empty(nobs, dtype=np.float64) maskedy = np.empty(nobs, dtype=np.float64) j = 0 + for i in range(N): if mask[i, xi] and mask[i, yi]: maskedx[j] = ranked_mat[i, xi] maskedy[j] = ranked_mat[i, yi] j += 1 + if not same_miss_pat: + maskedx = rank_1d_float64(maskedx) + maskedy = rank_1d_float64(maskedy) + mean = (nobs + 1) / 2. # now the cov numerator From 1118a3c37d510c6fa3e0cef6952484e108979ecb Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Mon, 26 Aug 2019 12:38:18 -0500 Subject: [PATCH 03/15] Add comment --- pandas/_libs/algos.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index af5a7d1666420..3240216eda246 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -316,6 +316,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): for xi in range(K): for yi in range(xi + 1): nobs = 0 + # Keep track of whether the two columns have the same + # missing pattern, if not we need to recalculate ranks same_miss_pat = True for i in range(N): same_miss_pat &= not (mask[i, xi] ^ mask[i, yi]) From 14e7f5166ef28dc71763e1a3b005f080e256e39c Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Tue, 27 Aug 2019 18:14:21 -0500 Subject: [PATCH 04/15] Avoid unnecessary ranking --- pandas/_libs/algos.pyx | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3240216eda246..ccddb630427fb 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -309,21 +309,27 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): mask = np.isfinite(mat).view(np.uint8) ranked_mat = np.empty((N, K), dtype=np.float64) - - for i in range(K): - ranked_mat[:, i] = rank_1d_float64(mat[:, i]) + cached_ranks = set({}) for xi in range(K): for yi in range(xi + 1): nobs = 0 # Keep track of whether the two columns have the same - # missing pattern, if not we need to recalculate ranks + # missing pattern, if so save off the ranks same_miss_pat = True for i in range(N): same_miss_pat &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: nobs += 1 + if same_miss_pat: + if xi not in cached_ranks: + ranked_mat[:, xi] = rank_1d_float64(mat[:, xi]) + cached_ranks.add(xi) + if yi not in cached_ranks: + ranked_mat[:, yi] = rank_1d_float64(mat[:, yi]) + cached_ranks.add(yi) + if nobs < minp: result[xi, yi] = result[yi, xi] = NaN else: @@ -333,8 +339,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): for i in range(N): if mask[i, xi] and mask[i, yi]: - maskedx[j] = ranked_mat[i, xi] - maskedy[j] = ranked_mat[i, yi] + maskedx[j] = ranked_mat[i, xi] if same_miss_pat else mat[i, xi] + maskedy[j] = ranked_mat[i, yi] if same_miss_pat else mat[i, yi] j += 1 if not same_miss_pat: From 316999610e61971da1fabe411997518f9a76f38b Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Tue, 27 Aug 2019 19:50:37 -0500 Subject: [PATCH 05/15] Rename same_miss_pat --- pandas/_libs/algos.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ccddb630427fb..2a1b55857831e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -316,13 +316,13 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): nobs = 0 # Keep track of whether the two columns have the same # missing pattern, if so save off the ranks - same_miss_pat = True + all_ranks = True for i in range(N): - same_miss_pat &= not (mask[i, xi] ^ mask[i, yi]) + all_ranks &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: nobs += 1 - if same_miss_pat: + if all_ranks: if xi not in cached_ranks: ranked_mat[:, xi] = rank_1d_float64(mat[:, xi]) cached_ranks.add(xi) @@ -339,11 +339,11 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): for i in range(N): if mask[i, xi] and mask[i, yi]: - maskedx[j] = ranked_mat[i, xi] if same_miss_pat else mat[i, xi] - maskedy[j] = ranked_mat[i, yi] if same_miss_pat else mat[i, yi] + maskedx[j] = ranked_mat[i, xi] if all_ranks else mat[i, xi] + maskedy[j] = ranked_mat[i, yi] if all_ranks else mat[i, yi] j += 1 - if not same_miss_pat: + if not all_ranks: maskedx = rank_1d_float64(maskedx) maskedy = rank_1d_float64(maskedy) From 23402261fb718d540271ef0884b5bf14f9161b43 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Tue, 3 Sep 2019 19:30:45 -0500 Subject: [PATCH 06/15] Add wide frame to benchmarks --- asv_bench/benchmarks/stat_ops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 620a6de0f5f34..1428b5d496d6d 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -113,12 +113,16 @@ def setup(self, method, use_bottleneck): nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) + self.df_wide = pd.DataFrame(np.random.randn(1000, 500)) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) def time_corr(self, method, use_bottleneck): self.df.corr(method=method) + def time_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + def time_corr_series(self, method, use_bottleneck): self.s.corr(self.s2, method=method) From 913193d7d80b6ac135501fb67a2421d5a40ea0fc Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Tue, 3 Sep 2019 19:41:26 -0500 Subject: [PATCH 07/15] Add performance note --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 8e25857e5ad69..6d5d8cd36ba25 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -76,7 +76,7 @@ Performance improvements - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) - +- Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`) .. _whatsnew_1000.bug_fixes: From 30cfc388a755c9f2a886ca1c0fa26e88e68d504e Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Tue, 3 Sep 2019 20:54:43 -0500 Subject: [PATCH 08/15] Add peakmem benchmark --- asv_bench/benchmarks/stat_ops.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 1428b5d496d6d..a20835010e4eb 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -123,6 +123,9 @@ def time_corr(self, method, use_bottleneck): def time_corr_wide(self, method, use_bottleneck): self.df_wide.corr(method=method) + def peakmem_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + def time_corr_series(self, method, use_bottleneck): self.s.corr(self.s2, method=method) From 420b1d6af8c92501a8b2366225ffd3d0887b6da8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Tue, 3 Sep 2019 21:48:20 -0500 Subject: [PATCH 09/15] Add nans to benchmark data --- asv_bench/benchmarks/stat_ops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index a20835010e4eb..c971ce7b54369 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -114,6 +114,7 @@ def setup(self, method, use_bottleneck): self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) self.df_wide = pd.DataFrame(np.random.randn(1000, 500)) + self.df_wide_nans = self.df_wide.where(np.random.random((1000, 500)) < 0.9) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) @@ -123,6 +124,9 @@ def time_corr(self, method, use_bottleneck): def time_corr_wide(self, method, use_bottleneck): self.df_wide.corr(method=method) + def time_corr_wide_nans(self, method, use_bottleneck): + self.df_wide_nans.corr(method=method) + def peakmem_corr_wide(self, method, use_bottleneck): self.df_wide.corr(method=method) From 15f70e22b41599488dcff0a0a61fe706d73937d9 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Thu, 5 Sep 2019 17:50:23 -0500 Subject: [PATCH 10/15] Revert "Avoid unnecessary ranking" This reverts commit 14e7f5166ef28dc71763e1a3b005f080e256e39c. --- pandas/_libs/algos.pyx | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 2a1b55857831e..3171553a279aa 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -309,7 +309,9 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): mask = np.isfinite(mat).view(np.uint8) ranked_mat = np.empty((N, K), dtype=np.float64) - cached_ranks = set({}) + + for i in range(K): + ranked_mat[:, i] = rank_1d_float64(mat[:, i]) for xi in range(K): for yi in range(xi + 1): @@ -322,14 +324,6 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): if mask[i, xi] and mask[i, yi]: nobs += 1 - if all_ranks: - if xi not in cached_ranks: - ranked_mat[:, xi] = rank_1d_float64(mat[:, xi]) - cached_ranks.add(xi) - if yi not in cached_ranks: - ranked_mat[:, yi] = rank_1d_float64(mat[:, yi]) - cached_ranks.add(yi) - if nobs < minp: result[xi, yi] = result[yi, xi] = NaN else: @@ -339,8 +333,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): for i in range(N): if mask[i, xi] and mask[i, yi]: - maskedx[j] = ranked_mat[i, xi] if all_ranks else mat[i, xi] - maskedy[j] = ranked_mat[i, yi] if all_ranks else mat[i, yi] + maskedx[j] = ranked_mat[i, xi] + maskedy[j] = ranked_mat[i, yi] j += 1 if not all_ranks: From ae3e3380efc48f21ce57770e537754b2ac8e0f65 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Thu, 5 Sep 2019 18:57:16 -0500 Subject: [PATCH 11/15] Edit comment --- pandas/_libs/algos.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3171553a279aa..0f91f612994c7 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -316,8 +316,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): for xi in range(K): for yi in range(xi + 1): nobs = 0 - # Keep track of whether the two columns have the same - # missing pattern, if so save off the ranks + # Keep track of whether we need to recompute ranks all_ranks = True for i in range(N): all_ranks &= not (mask[i, xi] ^ mask[i, yi]) From 01cb456bc11e982453e2718de83526ea256532d7 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Thu, 5 Sep 2019 19:02:55 -0500 Subject: [PATCH 12/15] Raise for Kendall benchmark --- asv_bench/benchmarks/stat_ops.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 3cf1e9f88764b..de2ebd8068f0a 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -122,9 +122,15 @@ def time_corr(self, method, use_bottleneck): self.df.corr(method=method) def time_corr_wide(self, method, use_bottleneck): + # Raise here until benchmark can pass + if method == "kendall": + raise NotImplementedError self.df_wide.corr(method=method) def time_corr_wide_nans(self, method, use_bottleneck): + # Raise here until benchmark can pass + if method == "kendall": + raise NotImplementedError self.df_wide_nans.corr(method=method) def peakmem_corr_wide(self, method, use_bottleneck): From 3e2da442898ea09675e3e01bd85f262b66a491f2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Thu, 5 Sep 2019 19:36:10 -0500 Subject: [PATCH 13/15] Try returning None --- asv_bench/benchmarks/stat_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index de2ebd8068f0a..182ad4bd3a303 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -124,13 +124,13 @@ def time_corr(self, method, use_bottleneck): def time_corr_wide(self, method, use_bottleneck): # Raise here until benchmark can pass if method == "kendall": - raise NotImplementedError + return None self.df_wide.corr(method=method) def time_corr_wide_nans(self, method, use_bottleneck): # Raise here until benchmark can pass if method == "kendall": - raise NotImplementedError + return None self.df_wide_nans.corr(method=method) def peakmem_corr_wide(self, method, use_bottleneck): From 738433a416148f9d60fd4fce143b58791349e1c7 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Fri, 6 Sep 2019 18:30:38 -0500 Subject: [PATCH 14/15] Fix benchmark --- asv_bench/benchmarks/stat_ops.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 182ad4bd3a303..825277728b19a 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -113,8 +113,8 @@ def setup(self, method, use_bottleneck): nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) - self.df_wide = pd.DataFrame(np.random.randn(1000, 500)) - self.df_wide_nans = self.df_wide.where(np.random.random((1000, 500)) < 0.9) + self.df_wide = pd.DataFrame(np.random.randn(1000, 300)) + self.df_wide_nans = self.df_wide.where(np.random.random((1000, 300)) < 0.9) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) @@ -122,15 +122,9 @@ def time_corr(self, method, use_bottleneck): self.df.corr(method=method) def time_corr_wide(self, method, use_bottleneck): - # Raise here until benchmark can pass - if method == "kendall": - return None self.df_wide.corr(method=method) def time_corr_wide_nans(self, method, use_bottleneck): - # Raise here until benchmark can pass - if method == "kendall": - return None self.df_wide_nans.corr(method=method) def peakmem_corr_wide(self, method, use_bottleneck): From 92d813429502f0439b4a6b20273be4bf24b42dcb Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Fri, 6 Sep 2019 20:30:51 -0500 Subject: [PATCH 15/15] Update benchmark data size --- asv_bench/benchmarks/stat_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 825277728b19a..ed5ebfa61594e 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -113,8 +113,8 @@ def setup(self, method, use_bottleneck): nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) - self.df_wide = pd.DataFrame(np.random.randn(1000, 300)) - self.df_wide_nans = self.df_wide.where(np.random.random((1000, 300)) < 0.9) + self.df_wide = pd.DataFrame(np.random.randn(1000, 200)) + self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000))