diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index addf1817fb4d6..75332a1e36ec2 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -86,6 +86,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.apply` when passing non-zero ``axis`` via keyword argument (:issue:`48656`) - Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`) - Fixed regression in :class:`ExcelWriter` where the ``book`` attribute could no longer be set; however setting this attribute is now deprecated and this ability will be removed in a future version of pandas (:issue:`48780`) +- Fixed regression in :meth:`DataFrame.corrwith` when computing correlation on tied data with ``method="spearman"`` (:issue:`48826`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 592b0f0ba62b8..b1e03c3c0d9e7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -162,6 +162,7 @@ from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( + BaseMaskedArray, DatetimeArray, ExtensionArray, PeriodArray, @@ -10593,23 +10594,30 @@ def corrwith( corrs = {} if numeric_only: cols = self.select_dtypes(include=np.number).columns - ndf = self[cols].values.transpose() else: cols = self.columns - ndf = self.values.transpose() k = other.values + k_mask = ~other.isna() + if isinstance(k, BaseMaskedArray): + k = k._data if method == "pearson": - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ + for col in cols: + val = self[col].values + nonnull_mask = ~self[col].isna() & k_mask + if isinstance(val, BaseMaskedArray): + val = 
val._data + corrs[col] = np.corrcoef(val[nonnull_mask], k[nonnull_mask])[ 0, 1 ] else: - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[cols[i]] = np.corrcoef( - r[nonnull_mask].argsort().argsort(), - k[nonnull_mask].argsort().argsort(), + for col in cols: + val = self[col].values + nonnull_mask = ~self[col].isna() & k_mask + if isinstance(val, BaseMaskedArray): + val = val._data + corrs[col] = np.corrcoef( + libalgos.rank_1d(val[nonnull_mask]), + libalgos.rank_1d(k[nonnull_mask]), )[0, 1] return Series(corrs) else: diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index ee9af3f436943..2d3cc6ff815cf 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only): expected = Series(data=corrs, index=["a", "b"]) tm.assert_series_equal(result, expected) else: - with pytest.raises(TypeError, match="not supported for the input types"): + with pytest.raises( + TypeError, + match=r"unsupported operand type\(s\) for /: 'str' and 'int'", + ): df.corrwith(s, numeric_only=numeric_only) def test_corrwith_index_intersection(self): @@ -406,3 +409,33 @@ def test_corrwith_kendall(self): result = df.corrwith(df**2, method="kendall") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) + + def test_corrwith_spearman_with_tied_data(self): + # GH#48826 + df = DataFrame( + { + "A": [2, 5, 8, 9], + "B": [2, np.nan, 8, 9], + "C": Series([2, np.nan, 8, 9], dtype="Int64"), + "D": [0, 1, 1, 0], + "E": [0, np.nan, 1, 0], + "F": Series([0, np.nan, 1, 0], dtype="Float64"), + "G": [False, True, True, False], + "H": Series([False, pd.NA, True, False], dtype="boolean"), + }, + ) + ser_list = [ + Series([0, 1, 1, 0]), + Series([0.0, 1.0, 1.0, 0.0]), + Series([False, True, True, False]), + Series([0, pd.NA, 1, 0], dtype="Int64"), + Series([0, pd.NA, 1, 0], 
dtype="Float64"), + Series([False, pd.NA, True, False], dtype="boolean"), + ] + expected = Series( + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], + index=["A", "B", "C", "D", "E", "F", "G", "H"], + ) + for ser in ser_list: + result = df.corrwith(ser, method="spearman", numeric_only=False) + tm.assert_series_equal(result, expected)