From 9f8e3cffaad603b7cb4e72f0dec949e49a863ac5 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Sun, 25 Aug 2019 19:42:19 -0500
Subject: [PATCH 01/15] Rank outside loop

---
 pandas/_libs/algos.pyx | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 038447ad252fe..e288e845dfb34 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -296,6 +296,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     cdef:
         Py_ssize_t i, j, xi, yi, N, K
         ndarray[float64_t, ndim=2] result
+        ndarray[float64_t, ndim=2] ranked_mat
         ndarray[float64_t, ndim=1] maskedx
         ndarray[float64_t, ndim=1] maskedy
         ndarray[uint8_t, ndim=2] mask
@@ -307,6 +308,11 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)
 
+    ranked_mat = np.empty((N, K), dtype=np.float64)
+
+    for i in range(K):
+        ranked_mat[:, i] = rank_1d_float64(mat[:, i])
+
     for xi in range(K):
         for yi in range(xi + 1):
             nobs = 0
@@ -322,11 +328,9 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
                 j = 0
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
-                        maskedx[j] = mat[i, xi]
-                        maskedy[j] = mat[i, yi]
+                        maskedx[j] = ranked_mat[i, xi]
+                        maskedy[j] = ranked_mat[i, yi]
                         j += 1
-                maskedx = rank_1d_float64(maskedx)
-                maskedy = rank_1d_float64(maskedy)
 
                 mean = (nobs + 1) / 2.
 

From 08d8e394c30a83cd35a05c1f41f8c4ad94addd0e Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Mon, 26 Aug 2019 08:38:52 -0500
Subject: [PATCH 02/15] Check for same missing pattern

---
 pandas/_libs/algos.pyx | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index e288e845dfb34..af5a7d1666420 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -316,7 +316,9 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     for xi in range(K):
         for yi in range(xi + 1):
             nobs = 0
+            same_miss_pat = True
             for i in range(N):
+                same_miss_pat &= not (mask[i, xi] ^ mask[i, yi])
                 if mask[i, xi] and mask[i, yi]:
                     nobs += 1
 
@@ -326,12 +328,17 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
                 maskedx = np.empty(nobs, dtype=np.float64)
                 maskedy = np.empty(nobs, dtype=np.float64)
                 j = 0
+
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
                         maskedx[j] = ranked_mat[i, xi]
                         maskedy[j] = ranked_mat[i, yi]
                         j += 1
 
+                if not same_miss_pat:
+                    maskedx = rank_1d_float64(maskedx)
+                    maskedy = rank_1d_float64(maskedy)
+
                 mean = (nobs + 1) / 2.
 
                 # now the cov numerator

From 1118a3c37d510c6fa3e0cef6952484e108979ecb Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Mon, 26 Aug 2019 12:38:18 -0500
Subject: [PATCH 03/15] Add comment

---
 pandas/_libs/algos.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index af5a7d1666420..3240216eda246 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -316,6 +316,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     for xi in range(K):
         for yi in range(xi + 1):
             nobs = 0
+            # Keep track of whether the two columns have the same
+            # missing pattern, if not we need to recalculate ranks
             same_miss_pat = True
             for i in range(N):
                 same_miss_pat &= not (mask[i, xi] ^ mask[i, yi])

From 14e7f5166ef28dc71763e1a3b005f080e256e39c Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Tue, 27 Aug 2019 18:14:21 -0500
Subject: [PATCH 04/15] Avoid unnecessary ranking

---
 pandas/_libs/algos.pyx | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 3240216eda246..ccddb630427fb 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -309,21 +309,27 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     mask = np.isfinite(mat).view(np.uint8)
 
     ranked_mat = np.empty((N, K), dtype=np.float64)
-
-    for i in range(K):
-        ranked_mat[:, i] = rank_1d_float64(mat[:, i])
+    cached_ranks = set({})
 
     for xi in range(K):
         for yi in range(xi + 1):
             nobs = 0
             # Keep track of whether the two columns have the same
-            # missing pattern, if not we need to recalculate ranks
+            # missing pattern, if so save off the ranks
             same_miss_pat = True
             for i in range(N):
                 same_miss_pat &= not (mask[i, xi] ^ mask[i, yi])
                 if mask[i, xi] and mask[i, yi]:
                     nobs += 1
 
+            if same_miss_pat:
+                if xi not in cached_ranks:
+                    ranked_mat[:, xi] = rank_1d_float64(mat[:, xi])
+                    cached_ranks.add(xi)
+                if yi not in cached_ranks:
+                    ranked_mat[:, yi] = rank_1d_float64(mat[:, yi])
+                    cached_ranks.add(yi)
+
             if nobs < minp:
                 result[xi, yi] = result[yi, xi] = NaN
             else:
@@ -333,8 +339,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
 
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
-                        maskedx[j] = ranked_mat[i, xi]
-                        maskedy[j] = ranked_mat[i, yi]
+                        maskedx[j] = ranked_mat[i, xi] if same_miss_pat else mat[i, xi]
+                        maskedy[j] = ranked_mat[i, yi] if same_miss_pat else mat[i, yi]
                         j += 1
 
                 if not same_miss_pat:

From 316999610e61971da1fabe411997518f9a76f38b Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Tue, 27 Aug 2019 19:50:37 -0500
Subject: [PATCH 05/15] Rename same_miss_pat

---
 pandas/_libs/algos.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index ccddb630427fb..2a1b55857831e 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -316,13 +316,13 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
             nobs = 0
             # Keep track of whether the two columns have the same
             # missing pattern, if so save off the ranks
-            same_miss_pat = True
+            all_ranks = True
             for i in range(N):
-                same_miss_pat &= not (mask[i, xi] ^ mask[i, yi])
+                all_ranks &= not (mask[i, xi] ^ mask[i, yi])
                 if mask[i, xi] and mask[i, yi]:
                     nobs += 1
 
-            if same_miss_pat:
+            if all_ranks:
                 if xi not in cached_ranks:
                     ranked_mat[:, xi] = rank_1d_float64(mat[:, xi])
                     cached_ranks.add(xi)
@@ -339,11 +339,11 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
 
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
-                        maskedx[j] = ranked_mat[i, xi] if same_miss_pat else mat[i, xi]
-                        maskedy[j] = ranked_mat[i, yi] if same_miss_pat else mat[i, yi]
+                        maskedx[j] = ranked_mat[i, xi] if all_ranks else mat[i, xi]
+                        maskedy[j] = ranked_mat[i, yi] if all_ranks else mat[i, yi]
                         j += 1
 
-                if not same_miss_pat:
+                if not all_ranks:
                     maskedx = rank_1d_float64(maskedx)
                     maskedy = rank_1d_float64(maskedy)
 

From 23402261fb718d540271ef0884b5bf14f9161b43 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Tue, 3 Sep 2019 19:30:45 -0500
Subject: [PATCH 06/15] Add wide frame to benchmarks

---
 asv_bench/benchmarks/stat_ops.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 620a6de0f5f34..1428b5d496d6d 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -113,12 +113,16 @@ def setup(self, method, use_bottleneck):
             nanops._USE_BOTTLENECK = use_bottleneck
         self.df = pd.DataFrame(np.random.randn(1000, 30))
         self.df2 = pd.DataFrame(np.random.randn(1000, 30))
+        self.df_wide = pd.DataFrame(np.random.randn(1000, 500))
         self.s = pd.Series(np.random.randn(1000))
         self.s2 = pd.Series(np.random.randn(1000))
 
     def time_corr(self, method, use_bottleneck):
         self.df.corr(method=method)
 
+    def time_corr_wide(self, method, use_bottleneck):
+        self.df_wide.corr(method=method)
+
     def time_corr_series(self, method, use_bottleneck):
         self.s.corr(self.s2, method=method)
 

From 913193d7d80b6ac135501fb67a2421d5a40ea0fc Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Tue, 3 Sep 2019 19:41:26 -0500
Subject: [PATCH 07/15] Add performance note

---
 doc/source/whatsnew/v1.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 8e25857e5ad69..6d5d8cd36ba25 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -76,7 +76,7 @@ Performance improvements
 - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
 - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`)
 - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)
-
+- Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`)
 
 .. _whatsnew_1000.bug_fixes:
 

From 30cfc388a755c9f2a886ca1c0fa26e88e68d504e Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Tue, 3 Sep 2019 20:54:43 -0500
Subject: [PATCH 08/15] Add peakmem benchmark

---
 asv_bench/benchmarks/stat_ops.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 1428b5d496d6d..a20835010e4eb 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -123,6 +123,9 @@ def time_corr(self, method, use_bottleneck):
     def time_corr_wide(self, method, use_bottleneck):
         self.df_wide.corr(method=method)
 
+    def peakmem_corr_wide(self, method, use_bottleneck):
+        self.df_wide.corr(method=method)
+
     def time_corr_series(self, method, use_bottleneck):
         self.s.corr(self.s2, method=method)
 

From 420b1d6af8c92501a8b2366225ffd3d0887b6da8 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Tue, 3 Sep 2019 21:48:20 -0500
Subject: [PATCH 09/15] Add nans to benchmark data

---
 asv_bench/benchmarks/stat_ops.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index a20835010e4eb..c971ce7b54369 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -114,6 +114,7 @@ def setup(self, method, use_bottleneck):
         self.df = pd.DataFrame(np.random.randn(1000, 30))
         self.df2 = pd.DataFrame(np.random.randn(1000, 30))
         self.df_wide = pd.DataFrame(np.random.randn(1000, 500))
+        self.df_wide_nans = self.df_wide.where(np.random.random((1000, 500)) < 0.9)
         self.s = pd.Series(np.random.randn(1000))
         self.s2 = pd.Series(np.random.randn(1000))
 
@@ -123,6 +124,9 @@ def time_corr(self, method, use_bottleneck):
     def time_corr_wide(self, method, use_bottleneck):
         self.df_wide.corr(method=method)
 
+    def time_corr_wide_nans(self, method, use_bottleneck):
+        self.df_wide_nans.corr(method=method)
+
     def peakmem_corr_wide(self, method, use_bottleneck):
         self.df_wide.corr(method=method)
 

From 15f70e22b41599488dcff0a0a61fe706d73937d9 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Thu, 5 Sep 2019 17:50:23 -0500
Subject: [PATCH 10/15] Revert "Avoid unnecessary ranking"

This reverts commit 14e7f5166ef28dc71763e1a3b005f080e256e39c.
---
 pandas/_libs/algos.pyx | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 2a1b55857831e..3171553a279aa 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -309,7 +309,9 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     mask = np.isfinite(mat).view(np.uint8)
 
     ranked_mat = np.empty((N, K), dtype=np.float64)
-    cached_ranks = set({})
+
+    for i in range(K):
+        ranked_mat[:, i] = rank_1d_float64(mat[:, i])
 
     for xi in range(K):
         for yi in range(xi + 1):
@@ -322,14 +324,6 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
                 if mask[i, xi] and mask[i, yi]:
                     nobs += 1
 
-            if all_ranks:
-                if xi not in cached_ranks:
-                    ranked_mat[:, xi] = rank_1d_float64(mat[:, xi])
-                    cached_ranks.add(xi)
-                if yi not in cached_ranks:
-                    ranked_mat[:, yi] = rank_1d_float64(mat[:, yi])
-                    cached_ranks.add(yi)
-
             if nobs < minp:
                 result[xi, yi] = result[yi, xi] = NaN
             else:
@@ -339,8 +333,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
 
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
-                        maskedx[j] = ranked_mat[i, xi] if all_ranks else mat[i, xi]
-                        maskedy[j] = ranked_mat[i, yi] if all_ranks else mat[i, yi]
+                        maskedx[j] = ranked_mat[i, xi]
+                        maskedy[j] = ranked_mat[i, yi]
                         j += 1
 
                 if not all_ranks:

From ae3e3380efc48f21ce57770e537754b2ac8e0f65 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Thu, 5 Sep 2019 18:57:16 -0500
Subject: [PATCH 11/15] Edit comment

---
 pandas/_libs/algos.pyx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 3171553a279aa..0f91f612994c7 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -316,8 +316,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     for xi in range(K):
         for yi in range(xi + 1):
             nobs = 0
-            # Keep track of whether the two columns have the same
-            # missing pattern, if so save off the ranks
+            # Keep track of whether we need to recompute ranks
             all_ranks = True
             for i in range(N):
                 all_ranks &= not (mask[i, xi] ^ mask[i, yi])

From 01cb456bc11e982453e2718de83526ea256532d7 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Thu, 5 Sep 2019 19:02:55 -0500
Subject: [PATCH 12/15] Raise for Kendall benchmark

---
 asv_bench/benchmarks/stat_ops.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 3cf1e9f88764b..de2ebd8068f0a 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -122,9 +122,15 @@ def time_corr(self, method, use_bottleneck):
         self.df.corr(method=method)
 
     def time_corr_wide(self, method, use_bottleneck):
+        # Raise here until benchmark can pass
+        if method == "kendall":
+            raise NotImplementedError
         self.df_wide.corr(method=method)
 
     def time_corr_wide_nans(self, method, use_bottleneck):
+        # Raise here until benchmark can pass
+        if method == "kendall":
+            raise NotImplementedError
         self.df_wide_nans.corr(method=method)
 
     def peakmem_corr_wide(self, method, use_bottleneck):

From 3e2da442898ea09675e3e01bd85f262b66a491f2 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Thu, 5 Sep 2019 19:36:10 -0500
Subject: [PATCH 13/15] Try returning None

---
 asv_bench/benchmarks/stat_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index de2ebd8068f0a..182ad4bd3a303 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -124,13 +124,13 @@ def time_corr(self, method, use_bottleneck):
     def time_corr_wide(self, method, use_bottleneck):
         # Raise here until benchmark can pass
         if method == "kendall":
-            raise NotImplementedError
+            return None
         self.df_wide.corr(method=method)
 
     def time_corr_wide_nans(self, method, use_bottleneck):
         # Raise here until benchmark can pass
         if method == "kendall":
-            raise NotImplementedError
+            return None
         self.df_wide_nans.corr(method=method)
 
     def peakmem_corr_wide(self, method, use_bottleneck):

From 738433a416148f9d60fd4fce143b58791349e1c7 Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Fri, 6 Sep 2019 18:30:38 -0500
Subject: [PATCH 14/15] Fix benchmark

---
 asv_bench/benchmarks/stat_ops.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 182ad4bd3a303..825277728b19a 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -113,8 +113,8 @@ def setup(self, method, use_bottleneck):
             nanops._USE_BOTTLENECK = use_bottleneck
         self.df = pd.DataFrame(np.random.randn(1000, 30))
         self.df2 = pd.DataFrame(np.random.randn(1000, 30))
-        self.df_wide = pd.DataFrame(np.random.randn(1000, 500))
-        self.df_wide_nans = self.df_wide.where(np.random.random((1000, 500)) < 0.9)
+        self.df_wide = pd.DataFrame(np.random.randn(1000, 300))
+        self.df_wide_nans = self.df_wide.where(np.random.random((1000, 300)) < 0.9)
         self.s = pd.Series(np.random.randn(1000))
         self.s2 = pd.Series(np.random.randn(1000))
 
@@ -122,15 +122,9 @@ def time_corr(self, method, use_bottleneck):
         self.df.corr(method=method)
 
     def time_corr_wide(self, method, use_bottleneck):
-        # Raise here until benchmark can pass
-        if method == "kendall":
-            return None
         self.df_wide.corr(method=method)
 
     def time_corr_wide_nans(self, method, use_bottleneck):
-        # Raise here until benchmark can pass
-        if method == "kendall":
-            return None
         self.df_wide_nans.corr(method=method)
 
     def peakmem_corr_wide(self, method, use_bottleneck):

From 92d813429502f0439b4a6b20273be4bf24b42dcb Mon Sep 17 00:00:00 2001
From: Daniel Saxton <>
Date: Fri, 6 Sep 2019 20:30:51 -0500
Subject: [PATCH 15/15] Update benchmark data size

---
 asv_bench/benchmarks/stat_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 825277728b19a..ed5ebfa61594e 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -113,8 +113,8 @@ def setup(self, method, use_bottleneck):
             nanops._USE_BOTTLENECK = use_bottleneck
         self.df = pd.DataFrame(np.random.randn(1000, 30))
         self.df2 = pd.DataFrame(np.random.randn(1000, 30))
-        self.df_wide = pd.DataFrame(np.random.randn(1000, 300))
-        self.df_wide_nans = self.df_wide.where(np.random.random((1000, 300)) < 0.9)
+        self.df_wide = pd.DataFrame(np.random.randn(1000, 200))
+        self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9)
         self.s = pd.Series(np.random.randn(1000))
         self.s2 = pd.Series(np.random.randn(1000))