From 36be92c69e80f665ace57f065d5b2ff578395722 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Apr 2020 11:22:18 -0500 Subject: [PATCH 1/7] BUG: Don't raise in DataFrame.corr with pd.NA --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/frame.py | 10 +++++++-- pandas/tests/frame/methods/test_cov_corr.py | 24 +++++++++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 845f7773c263c..8529105e8b93c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -520,7 +520,7 @@ Numeric - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) -- +- Bug in :meth:`DataFrame.corr` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d68cadbc75675..ab481b59bb577 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7834,7 +7834,10 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() - mat = numeric_df.values + mat = numeric_df.to_numpy() + if is_object_dtype(mat.dtype): + # We end up with an object array if pd.NA is present + mat[isna(mat)] = np.nan if method == "pearson": correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) @@ -7969,7 +7972,10 @@ def cov(self, min_periods=None) -> "DataFrame": numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() - mat = numeric_df.values + mat = numeric_df.to_numpy() + if is_object_dtype(mat.dtype): + # We end up with an object array if pd.NA is present + mat[isna(mat)] = np.nan if notna(mat).all(): if min_periods is not None and min_periods > len(mat): diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 5c13b60aae0d0..3ffc8ec10c344 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -58,6 +58,16 @@ def test_cov(self, float_frame, float_string_frame): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])] + ) + def test_cov_nullable_integer(self, other_column): + data = pd.DataFrame({"a": pd.array([1, 2, None]), "b": other_column}) + result = data.cov() + arr = np.array([[0.5, 0.5], [0.5, 1.0]]) + expected = pd.DataFrame(arr, columns=["a", "b"], index=["a", "b"]) + tm.assert_frame_equal(result, expected) + class TestDataFrameCorr: # DataFrame.corr(), as opposed to DataFrame.corrwith @@ -153,6 +163,20 @@ def test_corr_int(self): df3.cov() df3.corr() + @pytest.mark.parametrize( + "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])] + ) + @pytest.mark.parametrize( + "other_column", + [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])], + ) + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_nullable_integer(self, nullable_column, other_column, method): + data = pd.DataFrame({"a": nullable_column, "b": other_column}) + result = data.corr(method=method) + expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) + tm.assert_frame_equal(result, expected) + class TestDataFrameCorrWith: def test_corrwith(self, datetime_frame): From 30df6b644ac059740ff42188a488941e2806f9e3 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Apr 2020 13:58:10 -0500 Subject: [PATCH 2/7] whatsnew and style --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/frame.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8529105e8b93c..e8d51c3423231 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -520,7 +520,7 @@ Numeric - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) -- Bug in :meth:`DataFrame.corr` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) +- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ab481b59bb577..161bdc3276188 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7979,15 +7979,15 @@ def cov(self, min_periods=None) -> "DataFrame": if notna(mat).all(): if min_periods is not None and min_periods > len(mat): - baseCov = np.empty((mat.shape[1], mat.shape[1])) - baseCov.fill(np.nan) + base_cov = np.empty((mat.shape[1], mat.shape[1])) + base_cov.fill(np.nan) else: - baseCov = np.cov(mat.T) - baseCov = baseCov.reshape((len(cols), len(cols))) + base_cov = np.cov(mat.T) + base_cov = base_cov.reshape((len(cols), len(cols))) else: - baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods) + base_cov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods) - return self._constructor(baseCov, index=idx, columns=cols) + return self._constructor(base_cov, index=idx, columns=cols) def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: """ From 9dba02a22dc4527955d5a45a05b3f24ea48b9e7d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Apr 2020 14:22:05 -0500 Subject: [PATCH 3/7] Skip scipy --- pandas/tests/frame/methods/test_cov_corr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 3ffc8ec10c344..75c65d9009340 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -163,6 +163,7 @@ def test_corr_int(self): df3.cov() df3.corr() + @td.skip_if_no_scipy @pytest.mark.parametrize( "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])] ) From 8e56a3f67f0461bef71f318b769c3bdf124536e5 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 26 Apr 2020 19:02:21 -0500 Subject: [PATCH 4/7] Issue number --- pandas/tests/frame/methods/test_cov_corr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 75c65d9009340..7d75db55c3073 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -62,6 +62,7 @@ def test_cov(self, float_frame, float_string_frame): "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])] ) def test_cov_nullable_integer(self, other_column): + # https://github.com/pandas-dev/pandas/issues/33803 data = pd.DataFrame({"a": pd.array([1, 2, None]), "b": other_column}) result = data.cov() arr = np.array([[0.5, 0.5], [0.5, 1.0]]) @@ -173,6 +174,7 @@ def test_corr_int(self): ) @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) def test_corr_nullable_integer(self, nullable_column, other_column, method): + # https://github.com/pandas-dev/pandas/issues/33803 data = pd.DataFrame({"a": nullable_column, "b": other_column}) result = data.corr(method=method) expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) From 8237cb103357365aa20b3585282d99cae0eda299 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 27 Apr 2020 11:20:10 -0500 Subject: [PATCH 5/7] astype to float --- pandas/core/frame.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 161bdc3276188..cc1972bb9c0ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7834,10 +7834,7 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() - mat = numeric_df.to_numpy() - if is_object_dtype(mat.dtype): - # We end up with an object array if pd.NA is present - mat[isna(mat)] = np.nan + mat = numeric_df.astype(float).to_numpy() if method == "pearson": correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) @@ -7972,10 +7969,7 @@ def cov(self, min_periods=None) -> "DataFrame": numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() - mat = numeric_df.to_numpy() - if is_object_dtype(mat.dtype): - # We end up with an object array if pd.NA is present - mat[isna(mat)] = np.nan + mat = numeric_df.astype(float).to_numpy() if notna(mat).all(): if min_periods is not None and min_periods > len(mat): From c2dd554c99a573b8c4c834b77408c45c82d1f01c Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 27 Apr 2020 11:50:49 -0500 Subject: [PATCH 6/7] Try without ensure_float --- pandas/core/frame.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8419e13f5ff7f..84cf0df99b076 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -84,7 +84,6 @@ validate_numeric_casting, ) from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_platform_int, infer_dtype_from_object, @@ -7874,13 +7873,13 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": mat = numeric_df.astype(float).to_numpy() if method == "pearson": - correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) + correl = libalgos.nancorr(mat, minp=min_periods) elif method == "spearman": - correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods) + correl = libalgos.nancorr_spearman(mat, minp=min_periods) elif method == "kendall" or callable(method): if min_periods is None: min_periods = 1 - mat = ensure_float64(mat).T + mat = mat.T corrf = nanops.get_corr_func(method) K = len(cols) correl = np.empty((K, K), dtype=float) @@ -8016,7 +8015,7 @@ def cov(self, min_periods=None) -> "DataFrame": base_cov = np.cov(mat.T) base_cov = base_cov.reshape((len(cols), len(cols))) else: - base_cov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods) + base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) return self._constructor(base_cov, index=idx, columns=cols) From c2c059b78b98a21e2eed55094c149ce401cc4d83 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 27 Apr 2020 16:36:16 -0500 Subject: [PATCH 7/7] Don't copy --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 84cf0df99b076..4b4801f4e8c58 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7870,7 +7870,7 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() - mat = numeric_df.astype(float).to_numpy() + mat = numeric_df.astype(float, copy=False).to_numpy() if method == "pearson": correl = libalgos.nancorr(mat, minp=min_periods) @@ -8005,7 +8005,7 @@ def cov(self, min_periods=None) -> "DataFrame": numeric_df = self._get_numeric_data() cols = numeric_df.columns idx = cols.copy() - mat = numeric_df.astype(float).to_numpy() + mat = numeric_df.astype(float, copy=False).to_numpy() if notna(mat).all(): if min_periods is not None and min_periods > len(mat):