Skip to content

Commit 8aa7072

Browse files
authored
BUG: Don't raise in DataFrame.corr with pd.NA (#33809)
1 parent 9bd296c commit 8aa7072

File tree

3 files changed

+39
-13
lines changed

3 files changed

+39
-13
lines changed

doc/source/whatsnew/v1.1.0.rst

+1-1
Original file line number | Diff line number | Diff line change
@@ -569,7 +569,7 @@ Numeric
569569
- Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`)
570570
- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`)
571571
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
572-
-
572+
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
573573

574574
Conversion
575575
^^^^^^^^^^

pandas/core/frame.py

+11-12
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,6 @@
8484
validate_numeric_casting,
8585
)
8686
from pandas.core.dtypes.common import (
87-
ensure_float64,
8887
ensure_int64,
8988
ensure_platform_int,
9089
infer_dtype_from_object,
@@ -7871,16 +7870,16 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame":
78717870
numeric_df = self._get_numeric_data()
78727871
cols = numeric_df.columns
78737872
idx = cols.copy()
7874-
mat = numeric_df.values
7873+
mat = numeric_df.astype(float, copy=False).to_numpy()
78757874

78767875
if method == "pearson":
7877-
correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
7876+
correl = libalgos.nancorr(mat, minp=min_periods)
78787877
elif method == "spearman":
7879-
correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
7878+
correl = libalgos.nancorr_spearman(mat, minp=min_periods)
78807879
elif method == "kendall" or callable(method):
78817880
if min_periods is None:
78827881
min_periods = 1
7883-
mat = ensure_float64(mat).T
7882+
mat = mat.T
78847883
corrf = nanops.get_corr_func(method)
78857884
K = len(cols)
78867885
correl = np.empty((K, K), dtype=float)
@@ -8006,19 +8005,19 @@ def cov(self, min_periods=None) -> "DataFrame":
80068005
numeric_df = self._get_numeric_data()
80078006
cols = numeric_df.columns
80088007
idx = cols.copy()
8009-
mat = numeric_df.values
8008+
mat = numeric_df.astype(float, copy=False).to_numpy()
80108009

80118010
if notna(mat).all():
80128011
if min_periods is not None and min_periods > len(mat):
8013-
baseCov = np.empty((mat.shape[1], mat.shape[1]))
8014-
baseCov.fill(np.nan)
8012+
base_cov = np.empty((mat.shape[1], mat.shape[1]))
8013+
base_cov.fill(np.nan)
80158014
else:
8016-
baseCov = np.cov(mat.T)
8017-
baseCov = baseCov.reshape((len(cols), len(cols)))
8015+
base_cov = np.cov(mat.T)
8016+
base_cov = base_cov.reshape((len(cols), len(cols)))
80188017
else:
8019-
baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)
8018+
base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
80208019

8021-
return self._constructor(baseCov, index=idx, columns=cols)
8020+
return self._constructor(base_cov, index=idx, columns=cols)
80228021

80238022
def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series:
80248023
"""

pandas/tests/frame/methods/test_cov_corr.py

+27
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,17 @@ def test_cov(self, float_frame, float_string_frame):
5858
)
5959
tm.assert_frame_equal(result, expected)
6060

61+
@pytest.mark.parametrize(
62+
"other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
63+
)
64+
def test_cov_nullable_integer(self, other_column):
65+
# https://github.com/pandas-dev/pandas/issues/33803
66+
data = pd.DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
67+
result = data.cov()
68+
arr = np.array([[0.5, 0.5], [0.5, 1.0]])
69+
expected = pd.DataFrame(arr, columns=["a", "b"], index=["a", "b"])
70+
tm.assert_frame_equal(result, expected)
71+
6172

6273
class TestDataFrameCorr:
6374
# DataFrame.corr(), as opposed to DataFrame.corrwith
@@ -153,6 +164,22 @@ def test_corr_int(self):
153164
df3.cov()
154165
df3.corr()
155166

167+
@td.skip_if_no_scipy
168+
@pytest.mark.parametrize(
169+
"nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
170+
)
171+
@pytest.mark.parametrize(
172+
"other_column",
173+
[pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
174+
)
175+
@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
176+
def test_corr_nullable_integer(self, nullable_column, other_column, method):
177+
# https://github.com/pandas-dev/pandas/issues/33803
178+
data = pd.DataFrame({"a": nullable_column, "b": other_column})
179+
result = data.corr(method=method)
180+
expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
181+
tm.assert_frame_equal(result, expected)
182+
156183

157184
class TestDataFrameCorrWith:
158185
def test_corrwith(self, datetime_frame):

0 commit comments

Comments
 (0)