Skip to content

Commit 08fa73f

Browse files
GYHHAHA
authored and
MarcoGorelli
committed
Update frame.py
1 parent 8608ac9 commit 08fa73f

File tree

3 files changed

+53
-11
lines changed

3 files changed

+53
-11
lines changed

doc/source/whatsnew/v1.5.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ Fixed regressions
8888
- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
8989
- Fixed regression in :meth:`DataFrame.to_parquet` raising when file name was specified as ``bytes`` (:issue:`48944`)
9090
- Fixed regression in :class:`ExcelWriter` where the ``book`` attribute could no longer be set; however setting this attribute is now deprecated and this ability will be removed in a future version of pandas (:issue:`48780`)
91+
- Fixed regression in :meth:`DataFrame.corrwith` when computing correlation on tied data with ``method="spearman"`` (:issue:`48826`)
9192

9293
.. ---------------------------------------------------------------------------
9394

pandas/core/frame.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@
161161
from pandas.core.array_algos.take import take_2d_multi
162162
from pandas.core.arraylike import OpsMixin
163163
from pandas.core.arrays import (
164+
BaseMaskedArray,
164165
DatetimeArray,
165166
ExtensionArray,
166167
PeriodArray,
@@ -10590,23 +10591,30 @@ def corrwith(
1059010591
corrs = {}
1059110592
if numeric_only:
1059210593
cols = self.select_dtypes(include=np.number).columns
10593-
ndf = self[cols].values.transpose()
1059410594
else:
1059510595
cols = self.columns
10596-
ndf = self.values.transpose()
1059710596
k = other.values
10597+
k_mask = ~other.isna()
10598+
if isinstance(k, BaseMaskedArray):
10599+
k = k._data
1059810600
if method == "pearson":
10599-
for i, r in enumerate(ndf):
10600-
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
10601-
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
10601+
for col in cols:
10602+
val = self[col].values
10603+
nonnull_mask = ~self[col].isna() & k_mask
10604+
if isinstance(val, BaseMaskedArray):
10605+
val = val._data
10606+
corrs[col] = np.corrcoef(val[nonnull_mask], k[nonnull_mask])[
1060210607
0, 1
1060310608
]
1060410609
else:
10605-
for i, r in enumerate(ndf):
10606-
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
10607-
corrs[cols[i]] = np.corrcoef(
10608-
r[nonnull_mask].argsort().argsort(),
10609-
k[nonnull_mask].argsort().argsort(),
10610+
for col in cols:
10611+
val = self[col].values
10612+
nonnull_mask = ~self[col].isna() & k_mask
10613+
if isinstance(val, BaseMaskedArray):
10614+
val = val._data
10615+
corrs[col] = np.corrcoef(
10616+
libalgos.rank_1d(val[nonnull_mask]),
10617+
libalgos.rank_1d(k[nonnull_mask]),
1061010618
)[0, 1]
1061110619
return Series(corrs)
1061210620
else:

pandas/tests/frame/methods/test_cov_corr.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only):
355355
expected = Series(data=corrs, index=["a", "b"])
356356
tm.assert_series_equal(result, expected)
357357
else:
358-
with pytest.raises(TypeError, match="not supported for the input types"):
358+
with pytest.raises(
359+
TypeError,
360+
match=r"unsupported operand type\(s\) for /: 'str' and 'int'",
361+
):
359362
df.corrwith(s, numeric_only=numeric_only)
360363

361364
def test_corrwith_index_intersection(self):
@@ -406,3 +409,33 @@ def test_corrwith_kendall(self):
406409
result = df.corrwith(df**2, method="kendall")
407410
expected = Series(np.ones(len(result)))
408411
tm.assert_series_equal(result, expected)
412+
413+
def test_corrwith_spearman_with_tied_data(self):
414+
# GH#48826
415+
df = DataFrame(
416+
{
417+
"A": [2, 5, 8, 9],
418+
"B": [2, np.nan, 8, 9],
419+
"C": Series([2, np.nan, 8, 9], dtype="Int64"),
420+
"D": [0, 1, 1, 0],
421+
"E": [0, np.nan, 1, 0],
422+
"F": Series([0, np.nan, 1, 0], dtype="Float64"),
423+
"G": [False, True, True, False],
424+
"H": Series([False, pd.NA, True, False], dtype="boolean"),
425+
},
426+
)
427+
ser_list = [
428+
Series([0, 1, 1, 0]),
429+
Series([0.0, 1.0, 1.0, 0.0]),
430+
Series([False, True, True, False]),
431+
Series([0, pd.NA, 1, 0], dtype="Int64"),
432+
Series([0, pd.NA, 1, 0], dtype="Float64"),
433+
Series([False, pd.NA, True, False], dtype="boolean"),
434+
]
435+
expected = Series(
436+
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
437+
index=["A", "B", "C", "D", "E", "F", "G", "H"],
438+
)
439+
for ser in ser_list:
440+
result = df.corrwith(ser, method="spearman", numeric_only=False)
441+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)