From d228d7be753dafe68385ab11f02b9c67486b7d8b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 10 Oct 2022 13:31:42 -0500 Subject: [PATCH 01/12] Update frame.py --- pandas/core/frame.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 592b0f0ba62b8..663183c7eb894 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10605,11 +10605,12 @@ def corrwith( 0, 1 ] else: + from scipy.stats import rankdata for i, r in enumerate(ndf): nonnull_mask = ~np.isnan(r) & ~np.isnan(k) corrs[cols[i]] = np.corrcoef( - r[nonnull_mask].argsort().argsort(), - k[nonnull_mask].argsort().argsort(), + rankdata(r[nonnull_mask]), + rankdata(k[nonnull_mask]), )[0, 1] return Series(corrs) else: From e230225c2bcdf1d5dc763613cd24e6f65f8f3a1c Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 10 Oct 2022 13:38:33 -0500 Subject: [PATCH 02/12] Update test_cov_corr.py --- pandas/tests/frame/methods/test_cov_corr.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index ee9af3f436943..1b7482506dff1 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -399,6 +399,15 @@ def test_corrwith_spearman(self): expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) + @td.skip_if_no_scipy + def test_corrwith_spearman_with_tied_data(self): + # GH#48826 + df = DataFrame({"A": [2, np.nan, 8, 9], "B": [0, 1, 1, 0]}) + s = Series([0, 1, 1, 0]) + result = df.corrwith(s, method="spearman") + expected = Series([0.0, 1.0], index=["A", "B"]) + tm.assert_series_equal(result, expected) + @td.skip_if_no_scipy def test_corrwith_kendall(self): # GH#21925 From 059effd1c4bdf31e494e94360fb952582abf054a Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 10 Oct 2022 13:42:12 -0500 Subject: [PATCH 03/12] Update v1.5.1.rst --- doc/source/whatsnew/v1.5.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index addf1817fb4d6..75332a1e36ec2 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -86,6 +86,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.apply` when passing non-zero ``axis`` via keyword argument (:issue:`48656`) - Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`) - Fixed regression in :class:`ExcelWriter` where the ``book`` attribute could no longer be set; however setting this attribute is now deprecated and this ability will be removed in a future version of pandas (:issue:`48780`) +- Fixed regression in :meth:`DataFrame.corrwith` when computing correlation on tied data with ``method="spearman"`` (:issue:`48826`) .. --------------------------------------------------------------------------- From 6a898f05eea04191a64dbd61d75d260e9d50ca5c Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 10 Oct 2022 15:10:07 -0500 Subject: [PATCH 04/12] add blank line --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 663183c7eb894..4ef5b347172f9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10606,6 +10606,7 @@ def corrwith( ] else: from scipy.stats import rankdata + for i, r in enumerate(ndf): nonnull_mask = ~np.isnan(r) & ~np.isnan(k) corrs[cols[i]] = np.corrcoef( From 8fd679d1b426930c846d6c6a764294d8f14f1ae7 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 10 Oct 2022 15:47:27 -0500 Subject: [PATCH 05/12] use algos.rank_1d --- pandas/core/frame.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4ef5b347172f9..8c1619ae0d1f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10598,20 +10598,23 @@ def corrwith( cols = self.columns ndf = self.values.transpose() k = other.values + k_mask = ~np.isnan(k) + if k.dtype == "bool": + k = k.astype("float") if method == "pearson": for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) + nonnull_mask = ~np.isnan(r) & k_mask corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ 0, 1 ] else: - from scipy.stats import rankdata - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & ~np.isnan(k) + nonnull_mask = ~np.isnan(r) & k_mask + if r.dtype == "bool": + r = r.astype("float") corrs[cols[i]] = np.corrcoef( - rankdata(r[nonnull_mask]), - rankdata(k[nonnull_mask]), + libalgos.rank_1d(r[nonnull_mask]), + libalgos.rank_1d(k[nonnull_mask]), )[0, 1] return Series(corrs) else: From 3528e4058efcc348007939d795877d7895b7f17f Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:22:08 -0500 Subject: [PATCH 06/12] add more tests --- pandas/tests/frame/methods/test_cov_corr.py | 88 ++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 1b7482506dff1..9e35ff4dd835b 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -355,7 +355,10 @@ def test_corrwith_mixed_dtypes(self, numeric_only): expected = Series(data=corrs, index=["a", "b"]) tm.assert_series_equal(result, expected) else: - with pytest.raises(TypeError, match="not supported for the input types"): + with pytest.raises( + TypeError, + match=r"unsupported operand type\(s\) for /: 'str' and 'int'", + ): df.corrwith(s, numeric_only=numeric_only) def test_corrwith_index_intersection(self): @@ -415,3 +418,86 @@ def test_corrwith_kendall(self): result = df.corrwith(df**2, method="kendall") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "numeric_only, ser, expected", + [ + ( + True, + Series([0, 1, 1, 0]), + Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), + ), + ( + True, + Series([0.0, 1.0, 1.0, 0.0]), + Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), + ), + ( + True, + Series([False, True, True, False]), + Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), + ), + ( + False, + Series([0, 1, 1, 0]), + Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), + ), + ( + False, + Series([0.0, 1.0, 1.0, 0.0]), + Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), + ), + ( + False, + Series([False, True, True, False]), + Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), + ), + ( + True, + Series([0, pd.NA, 1, 0], dtype="Int64"), + Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), + ), + ( + True, + Series([0.0, pd.NA, 1.0, 0.0], dtype="Float64"), + Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), + ), + ( + True, + Series([False, pd.NA, True, False], dtype="boolean"), + Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), + ), + ( + False, + Series([0, pd.NA, 1, 0], dtype="Int64"), + Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), + ), + ( + False, + Series([0, pd.NA, 1, 0], dtype="Int64"), + Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), + ), + ( + False, + Series([False, pd.NA, True, False], dtype="boolean"), + Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), + ), + ], + ) + def test_corrwith_spearman_with_tied_data(self, ser, numeric_only, expected): + # GH#21925 + df = DataFrame( + { + "A": [2, 5, 8, 9], + "B": [2, np.nan, 8, 9], + "C": [2, np.nan, 8, 9], + "D": [0, 1, 1, 0], + "E": [0, np.nan, 1, 0], + "F": [0, np.nan, 1, 0], + "G": [False, True, True, False], + "H": [False, pd.NA, True, False], + }, + ).astype({"C": "Int64", "F": "Float64", "H": "boolean"}) + s = Series([0, 1, 1, 0]) + result = df.corrwith(s, method="spearman", numeric_only=numeric_only) + tm.assert_series_equal(result, expected) From 12e923215682e7c9f0084eaf9adfa7be8786ebdf Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:23:20 -0500 Subject: [PATCH 07/12] Update frame.py --- pandas/core/frame.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c1619ae0d1f0..499250e3bd1d2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10593,27 +10593,27 @@ def corrwith( corrs = {} if numeric_only: cols = self.select_dtypes(include=np.number).columns - ndf = self[cols].values.transpose() else: cols = self.columns - ndf = self.values.transpose() k = other.values - k_mask = ~np.isnan(k) - if k.dtype == "bool": - k = k.astype("float") + k_mask = ~other.isna() + if isinstance(k, BaseMaskedArray): + k = k._data if method == "pearson": - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & k_mask - corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ - 0, 1 - ] + for col in cols: + val = self[col].values + nonnull_mask = ~self[col].isna() & k_mask + corrs[col] = np.corrcoef( + val[nonnull_mask], k[nonnull_mask] + )[0, 1] else: - for i, r in enumerate(ndf): - nonnull_mask = ~np.isnan(r) & k_mask - if r.dtype == "bool": - r = r.astype("float") - corrs[cols[i]] = np.corrcoef( - libalgos.rank_1d(r[nonnull_mask]), + for col in cols: + val = self[col].values + nonnull_mask = ~self[col].isna() & k_mask + if isinstance(val, BaseMaskedArray): + val = val._data + corrs[col] = np.corrcoef( + libalgos.rank_1d(val[nonnull_mask]), libalgos.rank_1d(k[nonnull_mask]), )[0, 1] return Series(corrs) From 8578581e2811698fe6f8f8919020d59b0dfbe359 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:26:17 -0500 Subject: [PATCH 08/12] convert nullable to np.array for pearson --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 499250e3bd1d2..c6741745f6bab 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10603,6 +10603,8 @@ def corrwith( for col in cols: val = self[col].values nonnull_mask = ~self[col].isna() & k_mask + if isinstance(val, BaseMaskedArray): + val = val._data corrs[col] = np.corrcoef( val[nonnull_mask], k[nonnull_mask] )[0, 1] From 93879dfc21875b971826d092de42daaa122b8e3b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:27:06 -0500 Subject: [PATCH 09/12] Update test_cov_corr.py --- pandas/tests/frame/methods/test_cov_corr.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 9e35ff4dd835b..a2ad269afa159 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -402,15 +402,6 @@ def test_corrwith_spearman(self): expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) - @td.skip_if_no_scipy - def test_corrwith_spearman_with_tied_data(self): - # GH#48826 - df = DataFrame({"A": [2, np.nan, 8, 9], "B": [0, 1, 1, 0]}) - s = Series([0, 1, 1, 0]) - result = df.corrwith(s, method="spearman") - expected = Series([0.0, 1.0], index=["A", "B"]) - tm.assert_series_equal(result, expected) - @td.skip_if_no_scipy def test_corrwith_kendall(self): # GH#21925 From 8ae940381f51a9e8cd17ae96941c9dbf8ca9df7c Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 11 Oct 2022 21:17:21 -0500 Subject: [PATCH 10/12] fix import and pre-commit --- pandas/core/frame.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c6741745f6bab..b1e03c3c0d9e7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -162,6 +162,7 @@ from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( + BaseMaskedArray, DatetimeArray, ExtensionArray, PeriodArray, @@ -10605,9 +10606,9 @@ def corrwith( nonnull_mask = ~self[col].isna() & k_mask if isinstance(val, BaseMaskedArray): val = val._data - corrs[col] = np.corrcoef( - val[nonnull_mask], k[nonnull_mask] - )[0, 1] + corrs[col] = np.corrcoef(val[nonnull_mask], k[nonnull_mask])[ + 0, 1 + ] else: for col in cols: val = self[col].values From 2f3c9528a5300fa20cf10186fbc6f21a9a634033 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Thu, 13 Oct 2022 20:53:57 -0500 Subject: [PATCH 11/12] put params into tests --- pandas/tests/frame/methods/test_cov_corr.py | 93 +++++---------------- 1 file changed, 20 insertions(+), 73 deletions(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index a2ad269afa159..8708c330ebea7 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -410,85 +410,32 @@ def test_corrwith_kendall(self): expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "numeric_only, ser, expected", - [ - ( - True, - Series([0, 1, 1, 0]), - Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), - ), - ( - True, - Series([0.0, 1.0, 1.0, 0.0]), - Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), - ), - ( - True, - Series([False, True, True, False]), - Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), - ), - ( - False, - Series([0, 1, 1, 0]), - Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), - ), - ( - False, - Series([0.0, 1.0, 1.0, 0.0]), - Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), - ), - ( - False, - Series([False, True, True, False]), - Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), - ), - ( - True, - Series([0, pd.NA, 1, 0], dtype="Int64"), - Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), - ), - ( - True, - Series([0.0, pd.NA, 1.0, 0.0], dtype="Float64"), - Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), - ), - ( - True, - Series([False, pd.NA, True, False], dtype="boolean"), - Series([0.0] * 3 + [1.0] * 4, index=list("ABCDEFH")), - ), - ( - False, - Series([0, pd.NA, 1, 0], dtype="Int64"), - Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), - ), - ( - False, - Series([0, pd.NA, 1, 0], dtype="Int64"), - Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), - ), - ( - False, - Series([False, pd.NA, True, False], dtype="boolean"), - Series([0.0] * 3 + [1.0] * 5, index=list("ABCDEFGH")), - ), - ], - ) - def test_corrwith_spearman_with_tied_data(self, ser, numeric_only, expected): + def test_corrwith_spearman_with_tied_data(self): # GH#21925 df = DataFrame( { "A": [2, 5, 8, 9], "B": [2, np.nan, 8, 9], - "C": [2, np.nan, 8, 9], + "C": pd.Series([2, np.nan, 8, 9], dtype="Int64"), "D": [0, 1, 1, 0], "E": [0, np.nan, 1, 0], - "F": [0, np.nan, 1, 0], + "F": pd.Series([0, np.nan, 1, 0], dtype="Float64"), "G": [False, True, True, False], - "H": [False, pd.NA, True, False], + "H": pd.Series([False, pd.NA, True, False], dtype="boolean"), }, - ).astype({"C": "Int64", "F": "Float64", "H": "boolean"}) - s = Series([0, 1, 1, 0]) - result = df.corrwith(s, method="spearman", numeric_only=numeric_only) - tm.assert_series_equal(result, expected) + ) + ser_list = [ + Series([0, 1, 1, 0]), + Series([0.0, 1.0, 1.0, 0.0]), + Series([False, True, True, False]), + Series([0, pd.NA, 1, 0], dtype="Int64"), + Series([0, pd.NA, 1, 0], dtype="Float64"), + Series([False, pd.NA, True, False], dtype="boolean"), + ] + expected = Series( + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], + index=["A", "B", "C", "D", "E", "F", "G", "H"], + ) + for ser in ser_list: + result = df.corrwith(ser, method="spearman", numeric_only=False) + tm.assert_series_equal(result, expected) From cef9e9a98473942ee4880dc98715bb636f6be4d3 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Thu, 13 Oct 2022 21:18:11 -0500 Subject: [PATCH 12/12] remove pd namespace --- pandas/tests/frame/methods/test_cov_corr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 8708c330ebea7..2d3cc6ff815cf 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -416,12 +416,12 @@ def test_corrwith_spearman_with_tied_data(self): { "A": [2, 5, 8, 9], "B": [2, np.nan, 8, 9], - "C": pd.Series([2, np.nan, 8, 9], dtype="Int64"), + "C": Series([2, np.nan, 8, 9], dtype="Int64"), "D": [0, 1, 1, 0], "E": [0, np.nan, 1, 0], - "F": pd.Series([0, np.nan, 1, 0], dtype="Float64"), + "F": Series([0, np.nan, 1, 0], dtype="Float64"), "G": [False, True, True, False], - "H": pd.Series([False, pd.NA, True, False], dtype="boolean"), + "H": Series([False, pd.NA, True, False], dtype="boolean"), }, ) ser_list = [