diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 08d20af314110..b2d3c588c3bb8 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -569,7 +569,7 @@ Numeric
 - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`)
 - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`)
 - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
--
+- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5810e86f2c8b1..4b4801f4e8c58 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -84,7 +84,6 @@
     validate_numeric_casting,
 )
 from pandas.core.dtypes.common import (
-    ensure_float64,
     ensure_int64,
     ensure_platform_int,
     infer_dtype_from_object,
@@ -7871,16 +7870,16 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame":
         numeric_df = self._get_numeric_data()
         cols = numeric_df.columns
         idx = cols.copy()
-        mat = numeric_df.values
+        mat = numeric_df.astype(float, copy=False).to_numpy()  # NA -> NaN (GH#33803)
 
         if method == "pearson":
-            correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
+            correl = libalgos.nancorr(mat, minp=min_periods)
         elif method == "spearman":
-            correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
+            correl = libalgos.nancorr_spearman(mat, minp=min_periods)
         elif method == "kendall" or callable(method):
             if min_periods is None:
                 min_periods = 1
-            mat = ensure_float64(mat).T
+            mat = mat.T
             corrf = nanops.get_corr_func(method)
             K = len(cols)
             correl = np.empty((K, K), dtype=float)
@@ -8006,19 +8005,19 @@ def cov(self, min_periods=None) -> "DataFrame":
         numeric_df = self._get_numeric_data()
         cols = numeric_df.columns
         idx = cols.copy()
-        mat = numeric_df.values
+        mat = numeric_df.astype(float, copy=False).to_numpy()  # NA -> NaN (GH#33803)
 
         if notna(mat).all():
             if min_periods is not None and min_periods > len(mat):
-                baseCov = np.empty((mat.shape[1], mat.shape[1]))
-                baseCov.fill(np.nan)
+                base_cov = np.empty((mat.shape[1], mat.shape[1]))
+                base_cov.fill(np.nan)
             else:
-                baseCov = np.cov(mat.T)
-            baseCov = baseCov.reshape((len(cols), len(cols)))
+                base_cov = np.cov(mat.T)
+            base_cov = base_cov.reshape((len(cols), len(cols)))
         else:
-            baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)
+            base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
 
-        return self._constructor(baseCov, index=idx, columns=cols)
+        return self._constructor(base_cov, index=idx, columns=cols)
 
     def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series:
         """
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
index 5c13b60aae0d0..7d75db55c3073 100644
--- a/pandas/tests/frame/methods/test_cov_corr.py
+++ b/pandas/tests/frame/methods/test_cov_corr.py
@@ -58,6 +58,17 @@ def test_cov(self, float_frame, float_string_frame):
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
+    )
+    def test_cov_nullable_integer(self, other_column):
+        # https://github.com/pandas-dev/pandas/issues/33803 - NA is skipped pairwise
+        data = pd.DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
+        result = data.cov()
+        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
+        expected = pd.DataFrame(arr, columns=["a", "b"], index=["a", "b"])
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameCorr:
     # DataFrame.corr(), as opposed to DataFrame.corrwith
@@ -153,6 +164,22 @@ def test_corr_int(self):
         df3.cov()
         df3.corr()
 
+    @td.skip_if_no_scipy
+    @pytest.mark.parametrize(
+        "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
+    )
+    @pytest.mark.parametrize(
+        "other_column",
+        [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
+    )
+    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
+    def test_corr_nullable_integer(self, nullable_column, other_column, method):
+        # https://github.com/pandas-dev/pandas/issues/33803 - NA must not raise
+        data = pd.DataFrame({"a": nullable_column, "b": other_column})
+        result = data.corr(method=method)
+        expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameCorrWith:
     def test_corrwith(self, datetime_frame):