Skip to content

ENH: Add numeric_only to frame methods #46708

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ Other enhancements
- :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
-
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down
70 changes: 55 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9764,6 +9764,7 @@ def corr(
self,
method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
min_periods: int = 1,
numeric_only: bool = True,
) -> DataFrame:
"""
Compute pairwise correlation of columns, excluding NA/null values.
Expand All @@ -9784,6 +9785,10 @@ def corr(
Minimum number of observations required per pair of columns
to have a valid result. Currently only available for Pearson
and Spearman correlation.
numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Expand Down Expand Up @@ -9823,10 +9828,13 @@ def corr(
dogs 1.0 NaN
cats NaN 1.0
""" # noqa:E501
numeric_df = self._get_numeric_data()
cols = numeric_df.columns
if numeric_only:
data = self._get_numeric_data()
else:
data = self
cols = data.columns
idx = cols.copy()
mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
correl = libalgos.nancorr(mat, minp=min_periods)
Expand Down Expand Up @@ -9865,7 +9873,12 @@ def corr(

return self._constructor(correl, index=idx, columns=cols)

def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame:
def cov(
self,
min_periods: int | None = None,
ddof: int | None = 1,
numeric_only: bool = True,
) -> DataFrame:
"""
Compute pairwise covariance of columns, excluding NA/null values.

Expand Down Expand Up @@ -9896,6 +9909,11 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame

.. versionadded:: 1.1.0

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
DataFrame
Expand Down Expand Up @@ -9964,10 +9982,13 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame
b NaN 1.248003 0.191417
c -0.150812 0.191417 0.895202
"""
numeric_df = self._get_numeric_data()
cols = numeric_df.columns
if numeric_only:
data = self._get_numeric_data()
else:
data = self
cols = data.columns
idx = cols.copy()
mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if notna(mat).all():
if min_periods is not None and min_periods > len(mat):
Expand All @@ -9981,7 +10002,14 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame

return self._constructor(base_cov, index=idx, columns=cols)

def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Series:
def corrwith(
self,
other,
axis: Axis = 0,
drop=False,
method="pearson",
numeric_only: bool = True,
) -> Series:
"""
Compute pairwise correlation.

Expand All @@ -10008,6 +10036,11 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
* callable: callable with input two 1d ndarrays
and returning a float.

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Series
Expand Down Expand Up @@ -10039,7 +10072,10 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
dtype: float64
""" # noqa:E501
axis = self._get_axis_number(axis)
this = self._get_numeric_data()
if numeric_only:
this = self._get_numeric_data()
else:
this = self

# GH46174: when other is a Series object and axis=0, we achieve a speedup over
# passing .corr() to .apply() by taking the columns as ndarrays and iterating
Expand All @@ -10052,19 +10088,23 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie
if isinstance(other, Series):
if axis == 0 and method in ["pearson", "spearman"]:
corrs = {}
numeric_cols = self.select_dtypes(include=np.number).columns
ndf = self[numeric_cols].values.transpose()
if numeric_only:
cols = self.select_dtypes(include=np.number).columns
ndf = self[cols].values.transpose()
else:
cols = self.columns
ndf = self.values.transpose()
k = other.values
if method == "pearson":
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[numeric_cols[i]] = np.corrcoef(
r[nonnull_mask], k[nonnull_mask]
)[0, 1]
corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[
0, 1
]
else:
for i, r in enumerate(ndf):
nonnull_mask = ~np.isnan(r) & ~np.isnan(k)
corrs[numeric_cols[i]] = np.corrcoef(
corrs[cols[i]] = np.corrcoef(
r[nonnull_mask].argsort().argsort(),
k[nonnull_mask].argsort().argsort(),
)[0, 1]
Expand Down
45 changes: 40 additions & 5 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,20 @@ def test_cov_nullable_integer(self, other_column):
expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("numeric_only", [True, False])
def test_cov_numeric_only(self, numeric_only):
# when the columns of the frame have different dtypes,
# the resulting ndarray has dtype=object,
# so it needs to be handled properly
df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
expected = DataFrame(0.5, index=["a"], columns=["a"])
if numeric_only:
result = df.cov(numeric_only=numeric_only)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(ValueError, match="could not convert string to float"):
df.cov(numeric_only=numeric_only)


class TestDataFrameCorr:
# DataFrame.corr(), as opposed to DataFrame.corrwith
Expand Down Expand Up @@ -235,6 +249,22 @@ def test_corr_min_periods_greater_than_length(self, method):
)
tm.assert_frame_equal(result, expected)

@td.skip_if_no_scipy
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_corr_numeric_only(self, meth, numeric_only):
# when the columns of the frame have different dtypes,
# the resulting ndarray has dtype=object,
# so it needs to be handled properly
df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
if numeric_only:
result = df.corr(meth, numeric_only=numeric_only)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(ValueError, match="could not convert string to float"):
df.corr(meth, numeric_only=numeric_only)


class TestDataFrameCorrWith:
def test_corrwith(self, datetime_frame):
Expand Down Expand Up @@ -300,16 +330,21 @@ def test_corrwith_matches_corrcoef(self):
tm.assert_almost_equal(c1, c2)
assert c1 < 1

def test_corrwith_mixed_dtypes(self):
@pytest.mark.parametrize("numeric_only", [True, False])
def test_corrwith_mixed_dtypes(self, numeric_only):
# GH#18570
df = DataFrame(
{"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
)
s = Series([0, 6, 7, 3])
result = df.corrwith(s)
corrs = [df["a"].corr(s), df["b"].corr(s)]
expected = Series(data=corrs, index=["a", "b"])
tm.assert_series_equal(result, expected)
if numeric_only:
result = df.corrwith(s, numeric_only=numeric_only)
corrs = [df["a"].corr(s), df["b"].corr(s)]
expected = Series(data=corrs, index=["a", "b"])
tm.assert_series_equal(result, expected)
else:
with pytest.raises(TypeError, match="not supported for the input types"):
df.corrwith(s, numeric_only=numeric_only)

def test_corrwith_index_intersection(self):
df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
Expand Down