Skip to content

Commit 860797b

Browse files
authored
ENH: Add numeric_only to frame methods (#46708)
1 parent 8172879 commit 860797b

File tree

3 files changed

+96
-21
lines changed

3 files changed

+96
-21
lines changed

doc/source/whatsnew/v1.5.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ Other enhancements
9494
- :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
9595
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
9696
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
97-
-
97+
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
9898

9999
.. ---------------------------------------------------------------------------
100100
.. _whatsnew_150.notable_bug_fixes:

pandas/core/frame.py

+55-15
Original file line numberDiff line numberDiff line change
@@ -9764,6 +9764,7 @@ def corr(
97649764
self,
97659765
method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
97669766
min_periods: int = 1,
9767+
numeric_only: bool = True,
97679768
) -> DataFrame:
97689769
"""
97699770
Compute pairwise correlation of columns, excluding NA/null values.
@@ -9784,6 +9785,10 @@ def corr(
97849785
Minimum number of observations required per pair of columns
97859786
to have a valid result. Currently only available for Pearson
97869787
and Spearman correlation.
9788+
numeric_only : bool, default True
9789+
Include only `float`, `int` or `boolean` data.
9790+
9791+
.. versionadded:: 1.5.0
97879792
97889793
Returns
97899794
-------
@@ -9823,10 +9828,13 @@ def corr(
98239828
dogs 1.0 NaN
98249829
cats NaN 1.0
98259830
""" # noqa:E501
9826-
numeric_df = self._get_numeric_data()
9827-
cols = numeric_df.columns
9831+
if numeric_only:
9832+
data = self._get_numeric_data()
9833+
else:
9834+
data = self
9835+
cols = data.columns
98289836
idx = cols.copy()
9829-
mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
9837+
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
98309838

98319839
if method == "pearson":
98329840
correl = libalgos.nancorr(mat, minp=min_periods)
@@ -9865,7 +9873,12 @@ def corr(
98659873

98669874
return self._constructor(correl, index=idx, columns=cols)
98679875

9868-
def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame:
9876+
def cov(
9877+
self,
9878+
min_periods: int | None = None,
9879+
ddof: int | None = 1,
9880+
numeric_only: bool = True,
9881+
) -> DataFrame:
98699882
"""
98709883
Compute pairwise covariance of columns, excluding NA/null values.
98719884
@@ -9896,6 +9909,11 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
98969909
98979910
.. versionadded:: 1.1.0
98989911
9912+
numeric_only : bool, default True
9913+
Include only `float`, `int` or `boolean` data.
9914+
9915+
.. versionadded:: 1.5.0
9916+
98999917
Returns
99009918
-------
99019919
DataFrame
@@ -9964,10 +9982,13 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
99649982
b NaN 1.248003 0.191417
99659983
c -0.150812 0.191417 0.895202
99669984
"""
9967-
numeric_df = self._get_numeric_data()
9968-
cols = numeric_df.columns
9985+
if numeric_only:
9986+
data = self._get_numeric_data()
9987+
else:
9988+
data = self
9989+
cols = data.columns
99699990
idx = cols.copy()
9970-
mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
9991+
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
99719992

99729993
if notna(mat).all():
99739994
if min_periods is not None and min_periods > len(mat):
@@ -9981,7 +10002,14 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
998110002

998210003
return self._constructor(base_cov, index=idx, columns=cols)
998310004

9984-
def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Series:
10005+
def corrwith(
10006+
self,
10007+
other,
10008+
axis: Axis = 0,
10009+
drop=False,
10010+
method="pearson",
10011+
numeric_only: bool = True,
10012+
) -> Series:
998510013
"""
998610014
Compute pairwise correlation.
998710015
@@ -10008,6 +10036,11 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
1000810036
* callable: callable with input two 1d ndarrays
1000910037
and returning a float.
1001010038
10039+
numeric_only : bool, default True
10040+
Include only `float`, `int` or `boolean` data.
10041+
10042+
.. versionadded:: 1.5.0
10043+
1001110044
Returns
1001210045
-------
1001310046
Series
@@ -10039,7 +10072,10 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
1003910072
dtype: float64
1004010073
""" # noqa:E501
1004110074
axis = self._get_axis_number(axis)
10042-
this = self._get_numeric_data()
10075+
if numeric_only:
10076+
this = self._get_numeric_data()
10077+
else:
10078+
this = self
1004310079

1004410080
# GH46174: when other is a Series object and axis=0, we achieve a speedup over
1004510081
# passing .corr() to .apply() by taking the columns as ndarrays and iterating
@@ -10052,19 +10088,23 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
1005210088
if isinstance(other, Series):
1005310089
if axis == 0 and method in ["pearson", "spearman"]:
1005410090
corrs = {}
10055-
numeric_cols = self.select_dtypes(include=np.number).columns
10056-
ndf = self[numeric_cols].values.transpose()
10091+
if numeric_only:
10092+
cols = self.select_dtypes(include=np.number).columns
10093+
ndf = self[cols].values.transpose()
10094+
else:
10095+
cols = self.columns
10096+
ndf = self.values.transpose()
1005710097
k = other.values
1005810098
if method == "pearson":
1005910099
for i, r in enumerate(ndf):
1006010100
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
10061-
corrs[numeric_cols[i]] = np.corrcoef(
10062-
r[nonnull_mask], k[nonnull_mask]
10063-
)[0, 1]
10101+
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
10102+
0, 1
10103+
]
1006410104
else:
1006510105
for i, r in enumerate(ndf):
1006610106
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
10067-
corrs[numeric_cols[i]] = np.corrcoef(
10107+
corrs[cols[i]] = np.corrcoef(
1006810108
r[nonnull_mask].argsort().argsort(),
1006910109
k[nonnull_mask].argsort().argsort(),
1007010110
)[0, 1]

pandas/tests/frame/methods/test_cov_corr.py

+40-5
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,20 @@ def test_cov_nullable_integer(self, other_column):
8383
expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
8484
tm.assert_frame_equal(result, expected)
8585

86+
@pytest.mark.parametrize("numeric_only", [True, False])
87+
def test_cov_numeric_only(self, numeric_only):
88+
# when dtypes of pandas series are different
89+
# then ndarray will have dtype=object,
90+
# so it needs to be properly handled
91+
df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
92+
expected = DataFrame(0.5, index=["a"], columns=["a"])
93+
if numeric_only:
94+
result = df.cov(numeric_only=numeric_only)
95+
tm.assert_frame_equal(result, expected)
96+
else:
97+
with pytest.raises(ValueError, match="could not convert string to float"):
98+
df.cov(numeric_only=numeric_only)
99+
86100

87101
class TestDataFrameCorr:
88102
# DataFrame.corr(), as opposed to DataFrame.corrwith
@@ -235,6 +249,22 @@ def test_corr_min_periods_greater_than_length(self, method):
235249
)
236250
tm.assert_frame_equal(result, expected)
237251

252+
@td.skip_if_no_scipy
253+
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
254+
@pytest.mark.parametrize("numeric_only", [True, False])
255+
def test_corr_numeric_only(self, meth, numeric_only):
256+
# when dtypes of pandas series are different
257+
# then ndarray will have dtype=object,
258+
# so it needs to be properly handled
259+
df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
260+
expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
261+
if numeric_only:
262+
result = df.corr(meth, numeric_only=numeric_only)
263+
tm.assert_frame_equal(result, expected)
264+
else:
265+
with pytest.raises(ValueError, match="could not convert string to float"):
266+
df.corr(meth, numeric_only=numeric_only)
267+
238268

239269
class TestDataFrameCorrWith:
240270
def test_corrwith(self, datetime_frame):
@@ -300,16 +330,21 @@ def test_corrwith_matches_corrcoef(self):
300330
tm.assert_almost_equal(c1, c2)
301331
assert c1 < 1
302332

303-
def test_corrwith_mixed_dtypes(self):
333+
@pytest.mark.parametrize("numeric_only", [True, False])
334+
def test_corrwith_mixed_dtypes(self, numeric_only):
304335
# GH#18570
305336
df = DataFrame(
306337
{"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
307338
)
308339
s = Series([0, 6, 7, 3])
309-
result = df.corrwith(s)
310-
corrs = [df["a"].corr(s), df["b"].corr(s)]
311-
expected = Series(data=corrs, index=["a", "b"])
312-
tm.assert_series_equal(result, expected)
340+
if numeric_only:
341+
result = df.corrwith(s, numeric_only=numeric_only)
342+
corrs = [df["a"].corr(s), df["b"].corr(s)]
343+
expected = Series(data=corrs, index=["a", "b"])
344+
tm.assert_series_equal(result, expected)
345+
else:
346+
with pytest.raises(TypeError, match="not supported for the input types"):
347+
df.corrwith(s, numeric_only=numeric_only)
313348

314349
def test_corrwith_index_intersection(self):
315350
df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])

0 commit comments

Comments
 (0)