Skip to content

Commit 11462d6

Browse files
authored
DEPR: numeric_only default in DataFrame methods with None/True (#46906)
1 parent d5cf2b8 commit 11462d6

File tree

7 files changed

+237
-43
lines changed

7 files changed

+237
-43
lines changed

doc/source/whatsnew/v1.5.0.rst

+43-1
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ Other enhancements
120120
- :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
121121
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
122122
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
123-
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`)
123+
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.DataFrameGroupBy.quantile` (:issue:`46560`)
124124
- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`)
125125
- Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
126126
- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
@@ -463,6 +463,48 @@ As ``group_keys=True`` is the default value of :meth:`DataFrame.groupby` and
463463
raise a ``FutureWarning``. This can be silenced and the previous behavior
464464
retained by specifying ``group_keys=False``.
465465

466+
.. _whatsnew_150.deprecations.numeric_only_default:
467+
468+
``numeric_only`` default value
469+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
470+
471+
Across the DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default
472+
value of the ``numeric_only`` argument, if it exists at all, was inconsistent.
473+
Furthermore, operations with the default value ``None`` can lead to surprising
474+
results. (:issue:`46560`)
475+
476+
.. code-block:: ipython
477+
478+
In [1]: df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
479+
480+
In [2]: # Reading the next line without knowing the contents of df, one would
481+
# expect the result to contain the products for both columns a and b.
482+
df[["a", "b"]].prod()
483+
Out[2]:
484+
a 2
485+
dtype: int64
486+
487+
To avoid this behavior, specifying the value ``numeric_only=None`` has been
488+
deprecated, and will be removed in a future version of pandas. In the future,
489+
all operations with a ``numeric_only`` argument will default to ``False``. Users
490+
should either call the operation only with columns that can be operated on, or
491+
specify ``numeric_only=True`` to operate only on Boolean, integer, and float columns.
492+
493+
In order to support the transition to the new behavior, the following methods have
494+
gained the ``numeric_only`` argument.
495+
496+
- :meth:`DataFrame.corr`
497+
- :meth:`DataFrame.corrwith`
498+
- :meth:`DataFrame.cov`
499+
- :meth:`DataFrame.idxmin`
500+
- :meth:`DataFrame.idxmax`
501+
- :meth:`.DataFrameGroupBy.idxmin`
502+
- :meth:`.DataFrameGroupBy.idxmax`
503+
- :meth:`.GroupBy.var`
504+
- :meth:`.GroupBy.std`
505+
- :meth:`.GroupBy.sem`
506+
- :meth:`.DataFrameGroupBy.quantile`
507+
466508
.. _whatsnew_150.deprecations.other:
467509

468510
Other Deprecations

pandas/core/common.py

+60
Original file line numberDiff line numberDiff line change
@@ -635,3 +635,63 @@ def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
635635
list of column names with the None values replaced.
636636
"""
637637
return [f"level_{i}" if name is None else name for i, name in enumerate(names)]
638+
639+
640+
def resolve_numeric_only(numeric_only: bool | None | lib.NoDefault) -> bool:
    """Translate the ``numeric_only`` argument of a method into a Boolean.

    See GH#46560 for details on the deprecation.

    Parameters
    ----------
    numeric_only : bool, None, or lib.no_default
        Value passed to the method.

    Returns
    -------
    bool
        ``True`` when ``numeric_only`` is ``lib.no_default``, ``False`` when it
        is ``None``, and the passed value itself otherwise.
    """
    if numeric_only is None:
        # Methods that had the numeric_only arg prior to 1.5.0 and try all
        # columns first default to None
        return False
    if numeric_only is lib.no_default:
        # Methods that behave like numeric_only=True and only got the
        # numeric_only arg in 1.5.0 default to lib.no_default
        return True
    # An explicit value was supplied; cast only narrows the static type.
    return cast(bool, numeric_only)
665+
666+
667+
def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = False):
    """Issue the FutureWarning announcing the numeric_only default change.

    See GH#46560 for details on the deprecation.

    Parameters
    ----------
    cls : type
        pandas type that is generating the warning.
    name : str
        Name of the method that is generating the warning.
    deprecate_none : bool, default False
        Whether to also warn about the deprecation of specifying ``numeric_only=None``.
    """
    # "all" and "any" are reported under the argument name bool_only.
    arg_name = "bool_only" if name in ["all", "any"] else "numeric_only"

    pieces = [
        f"The default value of {arg_name} in {cls.__name__}.{name} is "
        "deprecated. In a future version, it will default to False. "
    ]
    if deprecate_none:
        pieces.append(f"In addition, specifying '{arg_name}=None' is deprecated. ")
    pieces.append(
        f"Select only valid columns or specify the value of {arg_name} to silence "
        "this warning."
    )

    warnings.warn("".join(pieces), FutureWarning, stacklevel=find_stack_level())

pandas/core/frame.py

+40-36
Original file line numberDiff line numberDiff line change
@@ -9833,7 +9833,7 @@ def corr(
98339833
self,
98349834
method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
98359835
min_periods: int = 1,
9836-
numeric_only: bool = True,
9836+
numeric_only: bool | lib.NoDefault = lib.no_default,
98379837
) -> DataFrame:
98389838
"""
98399839
Compute pairwise correlation of columns, excluding NA/null values.
@@ -9859,6 +9859,10 @@ def corr(
98599859
98609860
.. versionadded:: 1.5.0
98619861
9862+
.. deprecated:: 1.5.0
9863+
The default value of ``numeric_only`` will be ``False`` in a future
9864+
version of pandas.
9865+
98629866
Returns
98639867
-------
98649868
DataFrame
@@ -9897,10 +9901,11 @@ def corr(
98979901
dogs 1.0 NaN
98989902
cats NaN 1.0
98999903
""" # noqa:E501
9900-
if numeric_only:
9901-
data = self._get_numeric_data()
9902-
else:
9903-
data = self
9904+
numeric_only_bool = com.resolve_numeric_only(numeric_only)
9905+
data = self._get_numeric_data() if numeric_only_bool else self
9906+
if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
9907+
com.deprecate_numeric_only_default(type(self), "corr")
9908+
99049909
cols = data.columns
99059910
idx = cols.copy()
99069911
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
@@ -9946,7 +9951,7 @@ def cov(
99469951
self,
99479952
min_periods: int | None = None,
99489953
ddof: int | None = 1,
9949-
numeric_only: bool = True,
9954+
numeric_only: bool | lib.NoDefault = lib.no_default,
99509955
) -> DataFrame:
99519956
"""
99529957
Compute pairwise covariance of columns, excluding NA/null values.
@@ -9983,6 +9988,10 @@ def cov(
99839988
99849989
.. versionadded:: 1.5.0
99859990
9991+
.. deprecated:: 1.5.0
9992+
The default value of ``numeric_only`` will be ``False`` in a future
9993+
version of pandas.
9994+
99869995
Returns
99879996
-------
99889997
DataFrame
@@ -10051,10 +10060,11 @@ def cov(
1005110060
b NaN 1.248003 0.191417
1005210061
c -0.150812 0.191417 0.895202
1005310062
"""
10054-
if numeric_only:
10055-
data = self._get_numeric_data()
10056-
else:
10057-
data = self
10063+
numeric_only_bool = com.resolve_numeric_only(numeric_only)
10064+
data = self._get_numeric_data() if numeric_only_bool else self
10065+
if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
10066+
com.deprecate_numeric_only_default(type(self), "cov")
10067+
1005810068
cols = data.columns
1005910069
idx = cols.copy()
1006010070
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
@@ -10077,7 +10087,7 @@ def corrwith(
1007710087
axis: Axis = 0,
1007810088
drop=False,
1007910089
method="pearson",
10080-
numeric_only: bool = True,
10090+
numeric_only: bool | lib.NoDefault = lib.no_default,
1008110091
) -> Series:
1008210092
"""
1008310093
Compute pairwise correlation.
@@ -10110,6 +10120,10 @@ def corrwith(
1011010120
1011110121
.. versionadded:: 1.5.0
1011210122
10123+
.. deprecated:: 1.5.0
10124+
The default value of ``numeric_only`` will be ``False`` in a future
10125+
version of pandas.
10126+
1011310127
Returns
1011410128
-------
1011510129
Series
@@ -10141,10 +10155,10 @@ def corrwith(
1014110155
dtype: float64
1014210156
""" # noqa:E501
1014310157
axis = self._get_axis_number(axis)
10144-
if numeric_only:
10145-
this = self._get_numeric_data()
10146-
else:
10147-
this = self
10158+
numeric_only_bool = com.resolve_numeric_only(numeric_only)
10159+
this = self._get_numeric_data() if numeric_only_bool else self
10160+
if numeric_only is lib.no_default and len(this.columns) < len(self.columns):
10161+
com.deprecate_numeric_only_default(type(self), "corrwith")
1014810162

1014910163
# GH46174: when other is a Series object and axis=0, we achieve a speedup over
1015010164
# passing .corr() to .apply() by taking the columns as ndarrays and iterating
@@ -10396,7 +10410,6 @@ def _reduce(
1039610410
filter_type=None,
1039710411
**kwds,
1039810412
):
10399-
1040010413
assert filter_type is None or filter_type == "bool", filter_type
1040110414
out_dtype = "bool" if filter_type == "bool" else None
1040210415

@@ -10451,14 +10464,15 @@ def _get_data() -> DataFrame:
1045110464
data = self._get_bool_data()
1045210465
return data
1045310466

10467+
numeric_only_bool = com.resolve_numeric_only(numeric_only)
1045410468
if numeric_only is not None or axis == 0:
1045510469
# For numeric_only non-None and axis non-None, we know
1045610470
# which blocks to use and no try/except is needed.
1045710471
# For numeric_only=None only the case with axis==0 and no object
1045810472
# dtypes are unambiguous can be handled with BlockManager.reduce
1045910473
# Case with EAs see GH#35881
1046010474
df = self
10461-
if numeric_only is True:
10475+
if numeric_only_bool:
1046210476
df = _get_data()
1046310477
if axis == 1:
1046410478
df = df.T
@@ -10479,16 +10493,8 @@ def _get_data() -> DataFrame:
1047910493

1048010494
if numeric_only is None and out.shape[0] != df.shape[1]:
1048110495
# columns have been dropped GH#41480
10482-
arg_name = "numeric_only"
10483-
if name in ["all", "any"]:
10484-
arg_name = "bool_only"
10485-
warnings.warn(
10486-
"Dropping of nuisance columns in DataFrame reductions "
10487-
f"(with '{arg_name}=None') is deprecated; in a future "
10488-
"version this will raise TypeError. Select only valid "
10489-
"columns before calling the reduction.",
10490-
FutureWarning,
10491-
stacklevel=find_stack_level(),
10496+
com.deprecate_numeric_only_default(
10497+
type(self), name, deprecate_none=True
1049210498
)
1049310499

1049410500
return out
@@ -10776,6 +10782,11 @@ def quantile(
1077610782
numeric_only : bool, default True
1077710783
If False, the quantile of datetime and timedelta data will be
1077810784
computed as well.
10785+
10786+
.. deprecated:: 1.5.0
10787+
The default value of ``numeric_only`` will be ``False`` in a future
10788+
version of pandas.
10789+
1077910790
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
1078010791
This optional parameter specifies the interpolation method to use,
1078110792
when the desired quantile lies between two data points `i` and `j`:
@@ -10833,15 +10844,8 @@ def quantile(
1083310844
axis = self._get_axis_number(axis)
1083410845
any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes)
1083510846
if numeric_only is no_default and any_not_numeric:
10836-
warnings.warn(
10837-
"In future versions of pandas, numeric_only will be set to "
10838-
"False by default, and the datetime/timedelta columns will "
10839-
"be considered in the results. To not consider these columns"
10840-
"specify numeric_only=True.",
10841-
FutureWarning,
10842-
stacklevel=find_stack_level(),
10843-
)
10844-
numeric_only = True
10847+
com.deprecate_numeric_only_default(type(self), "quantile")
10848+
numeric_only = com.resolve_numeric_only(numeric_only)
1084510849

1084610850
if not is_list_like(q):
1084710851
# BlockManager.quantile expects listlike, so we wrap and unwrap here

pandas/core/generic.py

+9
Original file line numberDiff line numberDiff line change
@@ -11558,6 +11558,11 @@ def _doc_params(cls):
1155811558
numeric_only : bool, default None
1155911559
Include only float, int, boolean columns. If None, will attempt to use
1156011560
everything, then use only numeric data. Not implemented for Series.
11561+
11562+
.. deprecated:: 1.5.0
11563+
Specifying ``numeric_only=None`` is deprecated. The default value will be
11564+
``False`` in a future version of pandas.
11565+
1156111566
{min_count}\
1156211567
**kwargs
1156311568
Additional keyword arguments to be passed to the function.
@@ -11588,6 +11593,10 @@ def _doc_params(cls):
1158811593
Include only float, int, boolean columns. If None, will attempt to use
1158911594
everything, then use only numeric data. Not implemented for Series.
1159011595
11596+
.. deprecated:: 1.5.0
11597+
Specifying ``numeric_only=None`` is deprecated. The default value will be
11598+
``False`` in a future version of pandas.
11599+
1159111600
Returns
1159211601
-------
1159311602
{name1} or {name2} (if level specified) \

pandas/tests/frame/methods/test_cov_corr.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ def test_cov(self, float_frame, float_string_frame):
4141
tm.assert_almost_equal(result["A"]["C"], expected)
4242

4343
# exclude non-numeric types
44-
result = float_string_frame.cov()
44+
with tm.assert_produces_warning(
45+
FutureWarning, match="The default value of numeric_only"
46+
):
47+
result = float_string_frame.cov()
4548
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
4649
tm.assert_frame_equal(result, expected)
4750

@@ -116,7 +119,10 @@ def test_corr_scipy_method(self, float_frame, method):
116119

117120
def test_corr_non_numeric(self, float_string_frame):
118121
# exclude non-numeric types
119-
result = float_string_frame.corr()
122+
with tm.assert_produces_warning(
123+
FutureWarning, match="The default value of numeric_only"
124+
):
125+
result = float_string_frame.corr()
120126
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
121127
tm.assert_frame_equal(result, expected)
122128

@@ -307,11 +313,17 @@ def test_corrwith_with_objects(self):
307313
df1["obj"] = "foo"
308314
df2["obj"] = "bar"
309315

310-
result = df1.corrwith(df2)
316+
with tm.assert_produces_warning(
317+
FutureWarning, match="The default value of numeric_only"
318+
):
319+
result = df1.corrwith(df2)
311320
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
312321
tm.assert_series_equal(result, expected)
313322

314-
result = df1.corrwith(df2, axis=1)
323+
with tm.assert_produces_warning(
324+
FutureWarning, match="The default value of numeric_only"
325+
):
326+
result = df1.corrwith(df2, axis=1)
315327
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
316328
tm.assert_series_equal(result, expected)
317329

0 commit comments

Comments
 (0)