Skip to content

REG: fix regression in df.corrwith on tied data when method is spearman #49032

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.apply` when passing non-zero ``axis`` via keyword argument (:issue:`48656`)
- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
- Fixed regression in :class:`ExcelWriter` where the ``book`` attribute could no longer be set; however setting this attribute is now deprecated and this ability will be removed in a future version of pandas (:issue:`48780`)
- Fixed regression in :meth:`DataFrame.corrwith` when computing correlation on tied data with ``method="spearman"`` (:issue:`48826`)

.. ---------------------------------------------------------------------------

Expand Down
28 changes: 18 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
BaseMaskedArray,
DatetimeArray,
ExtensionArray,
PeriodArray,
Expand Down Expand Up @@ -10593,23 +10594,30 @@ def corrwith(
corrs = {}
if numeric_only:
cols = self.select_dtypes(include=np.number).columns
ndf = self[cols].values.transpose()
else:
cols = self.columns
ndf = self.values.transpose()
k = other.values
k_mask = ~other.isna()
if isinstance(k, BaseMaskedArray):
k = k._data
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we special-casing here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nullable arrays are not supported by rank_1d.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use the mask too in that case

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean we still cast the type, but write the rank step as follows?

libalgos.rank_1d(k, mask=nonnull_mask)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant something more like `k_mask = k._mask`.

if method == "pearson":
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
for col in cols:
val = self[col].values
nonnull_mask = ~self[col].isna() & k_mask
if isinstance(val, BaseMaskedArray):
val = val._data
corrs[col] = np.corrcoef(val[nonnull_mask], k[nonnull_mask])[
0, 1
]
else:
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[cols[i]] = np.corrcoef(
r[nonnull_mask].argsort().argsort(),
k[nonnull_mask].argsort().argsort(),
for col in cols:
val = self[col].values
nonnull_mask = ~self[col].isna() & k_mask
if isinstance(val, BaseMaskedArray):
val = val._data
corrs[col] = np.corrcoef(
libalgos.rank_1d(val[nonnull_mask]),
libalgos.rank_1d(k[nonnull_mask]),
)[0, 1]
return Series(corrs)
else:
Expand Down
35 changes: 34 additions & 1 deletion pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only):
expected = Series(data=corrs, index=["a", "b"])
tm.assert_series_equal(result, expected)
else:
with pytest.raises(TypeError, match="not supported for the input types"):
with pytest.raises(
TypeError,
match=r"unsupported operand type\(s\) for /: 'str' and 'int'",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now this is raised by np.corrcoef.

):
df.corrwith(s, numeric_only=numeric_only)

def test_corrwith_index_intersection(self):
Expand Down Expand Up @@ -406,3 +409,33 @@ def test_corrwith_kendall(self):
result = df.corrwith(df**2, method="kendall")
expected = Series(np.ones(len(result)))
tm.assert_series_equal(result, expected)

def test_corrwith_spearman_with_tied_data(self):
# GH#48826 per the whatsnew entry for this fix
# NOTE(review): this comment originally cited GH#21925 -- confirm the issue number.
#
# Checks DataFrame.corrwith(..., method="spearman") against a Series whose
# values contain ties (0/1 or False/True repeated), across plain and
# nullable dtypes.
#
# Columns A-C hold the values 2, 5/missing, 8, 9 (int, float with NaN,
# nullable Int64); columns D-H hold the 0, 1/missing, 1, 0 pattern as
# int, float with NaN, nullable Float64, bool and nullable boolean.
# Every missing entry sits at position 1.
df = DataFrame(
{
"A": [2, 5, 8, 9],
"B": [2, np.nan, 8, 9],
"C": Series([2, np.nan, 8, 9], dtype="Int64"),
"D": [0, 1, 1, 0],
"E": [0, np.nan, 1, 0],
"F": Series([0, np.nan, 1, 0], dtype="Float64"),
"G": [False, True, True, False],
"H": Series([False, pd.NA, True, False], dtype="boolean"),
},
)
# The same 0/1/1/0 pattern expressed in each dtype under test; the nullable
# variants carry pd.NA at position 1.
ser_list = [
Series([0, 1, 1, 0]),
Series([0.0, 1.0, 1.0, 0.0]),
Series([False, True, True, False]),
Series([0, pd.NA, 1, 0], dtype="Int64"),
Series([0, pd.NA, 1, 0], dtype="Float64"),
Series([False, pd.NA, True, False], dtype="boolean"),
]
# Expected correlation is 0.0 for columns A-C and 1.0 for columns D-H,
# and must be the same for every Series in ser_list.
expected = Series(
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
index=["A", "B", "C", "D", "E", "F", "G", "H"],
)
for ser in ser_list:
# numeric_only=False keeps the bool/boolean columns G and H in the result.
result = df.corrwith(ser, method="spearman", numeric_only=False)
tm.assert_series_equal(result, expected)