Skip to content

Commit 523bf34

Browse files
Revert "PERF: faster corrwith method for pearson and spearman correlation when other is a Series and axis = 0 (column-wise) (#46174)" (#49140)
* Update frame.py
* Revert "PERF: faster corrwith method for pearson and spearman correlation when other is a Series and axis = 0 (column-wise) (#46174)"
  This reverts commit 5efb570.
* fix GH issue number in test
* add test from original issue
* skip if no scipy
* add extra test case

Co-authored-by: Yuanhao Geng <[email protected]>
Co-authored-by: MarcoGorelli <>
1 parent 9415ce2 commit 523bf34

File tree

3 files changed

+29
-34
lines changed

3 files changed

+29
-34
lines changed

doc/source/whatsnew/v1.5.1.rst

+1
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ Fixed regressions
8888
- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
8989
- Fixed regression in :meth:`DataFrame.to_parquet` raising when file name was specified as ``bytes`` (:issue:`48944`)
9090
- Fixed regression in :class:`ExcelWriter` where the ``book`` attribute could no longer be set; however setting this attribute is now deprecated and this ability will be removed in a future version of pandas (:issue:`48780`)
91+
- Fixed regression in :meth:`DataFrame.corrwith` when computing correlation on tied data with ``method="spearman"`` (:issue:`48826`)
9192

9293
.. ---------------------------------------------------------------------------
9394

pandas/core/frame.py

+1-33
Original file line number | Diff line number | Diff line change
@@ -10577,40 +10577,8 @@ def corrwith(
1057710577
if numeric_only is lib.no_default and len(this.columns) < len(self.columns):
1057810578
com.deprecate_numeric_only_default(type(self), "corrwith")
1057910579

10580-
# GH46174: when other is a Series object and axis=0, we achieve a speedup over
10581-
# passing .corr() to .apply() by taking the columns as ndarrays and iterating
10582-
# over the transposition row-wise. Then we delegate the correlation coefficient
10583-
# computation and null-masking to np.corrcoef and np.isnan respectively,
10584-
# which are much faster. We exploit the fact that the Spearman correlation
10585-
# of two vectors is equal to the Pearson correlation of their ranks to use
10586-
# substantially the same method for Pearson and Spearman,
10587-
# just with intermediate argsorts on the latter.
1058810580
if isinstance(other, Series):
10589-
if axis == 0 and method in ["pearson", "spearman"]:
10590-
corrs = {}
10591-
if numeric_only:
10592-
cols = self.select_dtypes(include=np.number).columns
10593-
ndf = self[cols].values.transpose()
10594-
else:
10595-
cols = self.columns
10596-
ndf = self.values.transpose()
10597-
k = other.values
10598-
if method == "pearson":
10599-
for i, r in enumerate(ndf):
10600-
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
10601-
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
10602-
0, 1
10603-
]
10604-
else:
10605-
for i, r in enumerate(ndf):
10606-
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
10607-
corrs[cols[i]] = np.corrcoef(
10608-
r[nonnull_mask].argsort().argsort(),
10609-
k[nonnull_mask].argsort().argsort(),
10610-
)[0, 1]
10611-
return Series(corrs)
10612-
else:
10613-
return this.apply(lambda x: other.corr(x, method=method), axis=axis)
10581+
return this.apply(lambda x: other.corr(x, method=method), axis=axis)
1061410582

1061510583
if numeric_only_bool:
1061610584
other = other._get_numeric_data()

pandas/tests/frame/methods/test_cov_corr.py

+27-1
Original file line number | Diff line number | Diff line change
@@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only):
355355
expected = Series(data=corrs, index=["a", "b"])
356356
tm.assert_series_equal(result, expected)
357357
else:
358-
with pytest.raises(TypeError, match="not supported for the input types"):
358+
with pytest.raises(
359+
TypeError,
360+
match=r"unsupported operand type\(s\) for /: 'str' and 'int'",
361+
):
359362
df.corrwith(s, numeric_only=numeric_only)
360363

361364
def test_corrwith_index_intersection(self):
@@ -406,3 +409,26 @@ def test_corrwith_kendall(self):
406409
result = df.corrwith(df**2, method="kendall")
407410
expected = Series(np.ones(len(result)))
408411
tm.assert_series_equal(result, expected)
412+
413+
@td.skip_if_no_scipy
414+
def test_corrwith_spearman_with_tied_data(self):
415+
# GH#48826
416+
df1 = DataFrame(
417+
{
418+
"A": [1, np.nan, 7, 8],
419+
"B": [False, True, True, False],
420+
"C": [10, 4, 9, 3],
421+
}
422+
)
423+
df2 = df1[["B", "C"]]
424+
result = (df1 + 1).corrwith(df2.B, method="spearman")
425+
expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
426+
tm.assert_series_equal(result, expected)
427+
428+
df_bool = DataFrame(
429+
{"A": [True, True, False, False], "B": [True, False, False, True]}
430+
)
431+
ser_bool = Series([True, True, False, True])
432+
result = df_bool.corrwith(ser_bool)
433+
expected = Series([0.57735, 0.57735], index=["A", "B"])
434+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)