PERF: nancorr_spearman fastpath (pandas-dev#41885)

mzeitlin11 · JulianWgs · commit 0c086feaa6b3 · 2021-07-03T13:10:23.000+02:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -842,7 +842,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
 - Performance improvement in :meth:`DataFrame.fillna` with ``method="pad"`` or ``method="backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
 - Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`)
-- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`)
+- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`, :issue:`41885`)
 - Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`)
 - Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.RollingGroupby.cov` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`)
 - Performance improvement in :func:`unique` for object data type (:issue:`37615`)
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -387,15 +387,23 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
         float64_t[::1] maskedx, maskedy
         ndarray[uint8_t, ndim=2] mask
         int64_t nobs = 0
+        bint no_nans
         float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
         const int64_t[:] labels_n, labels_nobs
 
     N, K = (<object>mat).shape
     # For compatibility when calling rank_1d
     labels_n = np.zeros(N, dtype=np.int64)
 
+    # Handle the edge case where we know all results will be nan
+    # to keep conditional logic inside loop simpler
+    if N < minp:
+        result = np.full((K, K), np.nan, dtype=np.float64)
+        return result
+
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)
+    no_nans = mask.all()
 
     ranked_mat = np.empty((N, K), dtype=np.float64)
 
@@ -409,51 +417,66 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
     with nogil:
         for xi in range(K):
             for yi in range(xi + 1):
-                nobs = 0
-                # Keep track of whether we need to recompute ranks
-                all_ranks = True
-                for i in range(N):
-                    all_ranks &= not (mask[i, xi] ^ mask[i, yi])
-                    if mask[i, xi] and mask[i, yi]:
-                        maskedx[nobs] = ranked_mat[i, xi]
-                        maskedy[nobs] = ranked_mat[i, yi]
-                        nobs += 1
-
-                if nobs < minp:
-                    result[xi, yi] = result[yi, xi] = NaN
-                else:
-                    if not all_ranks:
-                        with gil:
-                            # We need to slice back to nobs because rank_1d will
-                            # require arrays of nobs length
-                            labels_nobs = np.zeros(nobs, dtype=np.int64)
-                            rankedx = rank_1d(np.array(maskedx)[:nobs],
-                                              labels=labels_nobs)
-                            rankedy = rank_1d(np.array(maskedy)[:nobs],
-                                              labels=labels_nobs)
-                        for i in range(nobs):
-                            maskedx[i] = rankedx[i]
-                            maskedy[i] = rankedy[i]
+                sumx = sumxx = sumyy = 0
 
-                    mean = (nobs + 1) / 2.
+                # Fastpath for data with no nans/infs, allows avoiding mask checks
+                # and array reassignments
+                if no_nans:
+                    mean = (N + 1) / 2.
 
                     # now the cov numerator
-                    sumx = sumxx = sumyy = 0
-
-                    for i in range(nobs):
-                        vx = maskedx[i] - mean
-                        vy = maskedy[i] - mean
+                    for i in range(N):
+                        vx = ranked_mat[i, xi] - mean
+                        vy = ranked_mat[i, yi] - mean
 
                         sumx += vx * vy
                         sumxx += vx * vx
                         sumyy += vy * vy
+                else:
+                    nobs = 0
+                    # Keep track of whether we need to recompute ranks
+                    all_ranks = True
+                    for i in range(N):
+                        all_ranks &= not (mask[i, xi] ^ mask[i, yi])
+                        if mask[i, xi] and mask[i, yi]:
+                            maskedx[nobs] = ranked_mat[i, xi]
+                            maskedy[nobs] = ranked_mat[i, yi]
+                            nobs += 1
+
+                    if nobs < minp:
+                        result[xi, yi] = result[yi, xi] = NaN
+                        continue
+                    else:
+                        if not all_ranks:
+                            with gil:
+                                # We need to slice back to nobs because rank_1d will
+                                # require arrays of nobs length
+                                labels_nobs = np.zeros(nobs, dtype=np.int64)
+                                rankedx = rank_1d(np.array(maskedx)[:nobs],
+                                                  labels=labels_nobs)
+                                rankedy = rank_1d(np.array(maskedy)[:nobs],
+                                                  labels=labels_nobs)
+                            for i in range(nobs):
+                                maskedx[i] = rankedx[i]
+                                maskedy[i] = rankedy[i]
+
+                        mean = (nobs + 1) / 2.
+
+                        # now the cov numerator
+                        for i in range(nobs):
+                            vx = maskedx[i] - mean
+                            vy = maskedy[i] - mean
 
-                    divisor = sqrt(sumxx * sumyy)
+                            sumx += vx * vy
+                            sumxx += vx * vx
+                            sumyy += vy * vy
 
-                    if divisor != 0:
-                        result[xi, yi] = result[yi, xi] = sumx / divisor
-                    else:
-                        result[xi, yi] = result[yi, xi] = NaN
+                divisor = sqrt(sumxx * sumyy)
+
+                if divisor != 0:
+                    result[xi, yi] = result[yi, xi] = sumx / divisor
+                else:
+                    result[xi, yi] = result[yi, xi] = NaN
 
     return result
 
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -232,6 +232,16 @@ def test_calc_corr_small_numbers(self):
         expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
         tm.assert_frame_equal(result, expected)
 
+    @td.skip_if_no_scipy
+    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
+    def test_corr_min_periods_greater_than_length(self, method):
+        df = DataFrame({"A": [1, 2], "B": [1, 2]})
+        result = df.corr(method=method, min_periods=3)
+        expected = DataFrame(
+            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameCorrWith:
     def test_corrwith(self, datetime_frame):