From 91c8128c328576b7f3b2399d9e7e008ea866b022 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 15 Apr 2022 17:04:56 -0400 Subject: [PATCH 1/3] DEPR: numeric_only default in DataFrame methods with None/True --- doc/source/whatsnew/v1.5.0.rst | 49 +++++++++++-- pandas/core/common.py | 60 ++++++++++++++++ pandas/core/frame.py | 76 +++++++++++---------- pandas/core/generic.py | 9 +++ pandas/tests/frame/methods/test_cov_corr.py | 20 ++++-- pandas/tests/frame/test_reductions.py | 63 ++++++++++++++++- pandas/tests/resample/test_resample_api.py | 8 ++- 7 files changed, 237 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 931d18dc349f3..8984c0895269f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -120,7 +120,7 @@ Other enhancements - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`) - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) -- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`) +- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.DataFrameGroupBy.quantile` (:issue:`46560`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, 
:issue:`46725`) - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`) - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`) @@ -194,11 +194,6 @@ did not have the same index as the input. df.groupby('a', dropna=True).transform('ffill') df.groupby('a', dropna=True).transform(lambda x: x) -.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: - -notable_bug_fix2 -^^^^^^^^^^^^^^^^ - .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: @@ -426,6 +421,48 @@ As ``group_keys=True`` is the default value of :meth:`DataFrame.groupby` and raise a ``FutureWarning``. This can be silenced and the previous behavior retained by specifying ``group_keys=False``. +.. _whatsnew_150.deprecations.numeric_only_default: + +``numeric_only`` default value +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Across the DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default +value of the ``numeric_only`` argument, if it exists at all, was inconsistent. +Furthermore, operations with the default value ``None`` can lead to surprising +results. (:issue:`46560`) + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}) + + In [2]: # Reading the next line without knowing the contents of df, one would + # expect the result to contain the products for both columns a and b. + df[["a", "b"]].prod() + Out[2]: + a 2 + dtype: int64 + +To avoid this behavior, specifying the value ``numeric_only=None`` has been +deprecated, and will be removed in a future version of pandas. In the future, +all operations with a ``numeric_only`` argument will default to ``False``. Users +should either call the operation only with columns that can be operated on, or +specify ``numeric_only=True`` to operate only on Boolean, integer, and float columns. 
+ +In order to support the transition to the new behavior, the following methods have +gained the ``numeric_only`` argument. + +- :meth:`DataFrame.corr` +- :meth:`DataFrame.corrwith` +- :meth:`DataFrame.cov` +- :meth:`DataFrame.idxmin` +- :meth:`DataFrame.idxmax` +- :meth:`.DataFrameGroupBy.idxmin` +- :meth:`.DataFrameGroupBy.idxmax` +- :meth:`.GroupBy.var` +- :meth:`.GroupBy.std` +- :meth:`.GroupBy.sem` +- :meth:`.DataFrameGroupBy.quantile` + .. _whatsnew_150.deprecations.other: Other Deprecations diff --git a/pandas/core/common.py b/pandas/core/common.py index 90f665362ef56..5e03f82a2b667 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -635,3 +635,63 @@ def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: list of column names with the None values replaced. """ return [f"level_{i}" if name is None else name for i, name in enumerate(names)] + + +def resolve_numeric_only(numeric_only: bool | None | lib.NoDefault) -> bool: + """Determine the Boolean value of numeric_only. + + See GH#46560 for details on the deprecation. + + Parameters + ---------- + numeric_only : bool, None, or lib.no_default + Value passed to the method. + + Returns + ------- + Resolved value of numeric_only. + """ + if numeric_only is lib.no_default: + # Methods that behave like numeric_only=True and only got the numeric_only + # arg in 1.5.0 default to lib.no_default + result = True + elif numeric_only is None: + # Methods that had the numeric_only arg prior to 1.5.0 and try all columns + # first default to None + result = False + else: + result = numeric_only + return result + + +def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = False): + """Emit FutureWarning message for deprecation of numeric_only. + + See GH#46560 for details on the deprecation. + + Parameters + ---------- + cls : type + pandas type that is generating the warning. + name : str + Name of the method that is generating the warning. 
+ deprecate_none : bool, default False + Whether to also warn about the deprecation of specifying ``numeric_only=None``. + """ + if name in ["all", "any"]: + arg_name = "bool_only" + else: + arg_name = "numeric_only" + + msg = ( + f"The default value of {arg_name} in {cls.__name__}.{name} is " + "deprecated. In a future version, it will default to False. " + ) + if deprecate_none: + msg += f"In addition, specifying '{arg_name}=None' is deprecated. " + msg += ( + f"Select only valid columns or specify the value of {arg_name} to silence " + "this warning." + ) + + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ef5e6dd1d6757..84ea8df0b9b20 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9833,7 +9833,7 @@ def corr( self, method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", min_periods: int = 1, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -9859,6 +9859,10 @@ def corr( .. versionadded:: 1.5.0 + .. deprecated:: 1.5.0 + The default value of ``numeric_only`` will be ``False`` in a future + version of pandas. 
+ Returns ------- DataFrame @@ -9897,10 +9901,11 @@ def corr( dogs 1.0 NaN cats NaN 1.0 """ # noqa:E501 - if numeric_only: - data = self._get_numeric_data() - else: - data = self + numeric_only_bool = com.resolve_numeric_only(numeric_only) + data = self._get_numeric_data() if numeric_only_bool else self + if numeric_only is lib.no_default and len(data.columns) < len(self.columns): + com.deprecate_numeric_only_default(type(self), "corr") + cols = data.columns idx = cols.copy() mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) @@ -9946,7 +9951,7 @@ def cov( self, min_periods: int | None = None, ddof: int | None = 1, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, ) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. @@ -9983,6 +9988,10 @@ def cov( .. versionadded:: 1.5.0 + .. deprecated:: 1.5.0 + The default value of ``numeric_only`` will be ``False`` in a future + version of pandas. + Returns ------- DataFrame @@ -10051,10 +10060,11 @@ def cov( b NaN 1.248003 0.191417 c -0.150812 0.191417 0.895202 """ - if numeric_only: - data = self._get_numeric_data() - else: - data = self + numeric_only_bool = com.resolve_numeric_only(numeric_only) + data = self._get_numeric_data() if numeric_only_bool else self + if numeric_only is lib.no_default and len(data.columns) < len(self.columns): + com.deprecate_numeric_only_default(type(self), "cov") + cols = data.columns idx = cols.copy() mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) @@ -10077,7 +10087,7 @@ def corrwith( axis: Axis = 0, drop=False, method="pearson", - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, ) -> Series: """ Compute pairwise correlation. @@ -10110,6 +10120,10 @@ def corrwith( .. versionadded:: 1.5.0 + .. deprecated:: 1.5.0 + The default value of ``numeric_only`` will be ``False`` in a future + version of pandas. 
+ Returns ------- Series @@ -10141,10 +10155,10 @@ def corrwith( dtype: float64 """ # noqa:E501 axis = self._get_axis_number(axis) - if numeric_only: - this = self._get_numeric_data() - else: - this = self + numeric_only_bool = com.resolve_numeric_only(numeric_only) + this = self._get_numeric_data() if numeric_only_bool else self + if numeric_only is lib.no_default and len(this.columns) < len(self.columns): + com.deprecate_numeric_only_default(type(self), "corrwith") # GH46174: when other is a Series object and axis=0, we achieve a speedup over # passing .corr() to .apply() by taking the columns as ndarrays and iterating @@ -10396,7 +10410,6 @@ def _reduce( filter_type=None, **kwds, ): - assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None @@ -10451,6 +10464,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data + numeric_only_bool = com.resolve_numeric_only(numeric_only) if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. @@ -10458,7 +10472,7 @@ def _get_data() -> DataFrame: # dtypes are unambiguous can be handled with BlockManager.reduce # Case with EAs see GH#35881 df = self - if numeric_only is True: + if numeric_only_bool: df = _get_data() if axis == 1: df = df.T @@ -10479,16 +10493,8 @@ def _get_data() -> DataFrame: if numeric_only is None and out.shape[0] != df.shape[1]: # columns have been dropped GH#41480 - arg_name = "numeric_only" - if name in ["all", "any"]: - arg_name = "bool_only" - warnings.warn( - "Dropping of nuisance columns in DataFrame reductions " - f"(with '{arg_name}=None') is deprecated; in a future " - "version this will raise TypeError. 
Select only valid " - "columns before calling the reduction.", - FutureWarning, - stacklevel=find_stack_level(), + com.deprecate_numeric_only_default( + type(self), name, deprecate_none=True ) return out @@ -10776,6 +10782,11 @@ def quantile( numeric_only : bool, default True If False, the quantile of datetime and timedelta data will be computed as well. + + .. deprecated:: 1.5.0 + The default value of ``numeric_only`` will be ``False`` in a future + version of pandas. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -10833,15 +10844,8 @@ def quantile( axis = self._get_axis_number(axis) any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes) if numeric_only is no_default and any_not_numeric: - warnings.warn( - "In future versions of pandas, numeric_only will be set to " - "False by default, and the datetime/timedelta columns will " - "be considered in the results. To not consider these columns" - "specify numeric_only=True.", - FutureWarning, - stacklevel=find_stack_level(), - ) - numeric_only = True + com.deprecate_numeric_only_default(type(self), "quantile") + numeric_only = com.resolve_numeric_only(numeric_only) if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c615216240d60..1a31a50606c2c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11554,6 +11554,11 @@ def _doc_params(cls): numeric_only : bool, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. + + .. deprecated:: 1.5.0 + Specifying ``numeric_only=None`` is deprecated. The default value will be + ``False`` in a future version of pandas. 
+ {min_count}\ **kwargs Additional keyword arguments to be passed to the function. @@ -11584,6 +11589,10 @@ def _doc_params(cls): Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. + .. deprecated:: 1.5.0 + Specifying ``numeric_only=None`` is deprecated. The default value will be + ``False`` in a future version of pandas. + Returns ------- {name1} or {name2} (if level specified) \ diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 3a86aa05fb227..2f0a4195d2f74 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -41,7 +41,10 @@ def test_cov(self, float_frame, float_string_frame): tm.assert_almost_equal(result["A"]["C"], expected) # exclude non-numeric types - result = float_string_frame.cov() + with tm.assert_produces_warning( + FutureWarning, match="The default value of numeric_only" + ): + result = float_string_frame.cov() expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() tm.assert_frame_equal(result, expected) @@ -116,7 +119,10 @@ def test_corr_scipy_method(self, float_frame, method): def test_corr_non_numeric(self, float_string_frame): # exclude non-numeric types - result = float_string_frame.corr() + with tm.assert_produces_warning( + FutureWarning, match="The default value of numeric_only" + ): + result = float_string_frame.corr() expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() tm.assert_frame_equal(result, expected) @@ -307,11 +313,17 @@ def test_corrwith_with_objects(self): df1["obj"] = "foo" df2["obj"] = "bar" - result = df1.corrwith(df2) + with tm.assert_produces_warning( + FutureWarning, match="The default value of numeric_only" + ): + result = df1.corrwith(df2) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) tm.assert_series_equal(result, expected) - result = df1.corrwith(df2, axis=1) + with 
tm.assert_produces_warning( + FutureWarning, match="The default value of numeric_only" + ): + result = df1.corrwith(df2, axis=1) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 41deeec7c4b57..7f2a13862f4ed 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1,11 +1,13 @@ from datetime import timedelta from decimal import Decimal +import inspect import re from dateutil.tz import tzlocal import numpy as np import pytest +from pandas._libs import lib from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -1752,7 +1754,9 @@ def test_groupby_regular_arithmetic_equivalent(meth): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with tm.assert_produces_warning(FutureWarning, match="Dropping of nuisance"): + with tm.assert_produces_warning( + FutureWarning, match="The default value of numeric_only" + ): result = df.sum() expected = Series([1, 1.1, "foo"], index=list("abc")) tm.assert_series_equal(result, expected) @@ -1786,3 +1790,60 @@ def test_reduction_axis_none_deprecation(method): expected = meth() tm.assert_series_equal(res, expected) tm.assert_series_equal(res, meth(axis=0)) + + +@pytest.mark.parametrize( + "kernel", + [ + "corr", + "corrwith", + "count", + "cov", + "idxmax", + "idxmin", + "kurt", + "kurt", + "max", + "mean", + "median", + "min", + "mode", + "prod", + "prod", + "quantile", + "sem", + "skew", + "std", + "sum", + "var", + ], +) +def test_numeric_only_deprecation(kernel): + # GH#46852 + df = DataFrame({"a": [1, 2, 3], "b": object}) + args = (df,) if kernel == "corrwith" else () + signature = inspect.signature(getattr(DataFrame, kernel)) + default = signature.parameters["numeric_only"].default + assert default is not True + + if 
kernel in ("idxmax", "idxmin"): + # kernels that default to numeric_only=False and fail on nuisance columns + assert default is False + with pytest.raises(TypeError, match="not allowed for this dtype"): + getattr(df, kernel)(*args) + else: + if default is None or default is lib.no_default: + expected = getattr(df[["a"]], kernel)(*args) + warn = FutureWarning + else: + # default must be False and works on any nuisance columns + expected = getattr(df, kernel)(*args) + if kernel == "mode": + assert "b" in expected.columns + else: + assert "b" in expected.index + warn = None + msg = f"The default value of numeric_only in DataFrame.{kernel}" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(df, kernel)(*args) + tm.assert_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index a5834dd237c01..b5bae4759090a 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -807,7 +807,13 @@ def test_frame_downsample_method(method, numeric_only, expected_data): resampled = df.resample("Y") func = getattr(resampled, method) - result = func(numeric_only=numeric_only) + if method == "prod" and numeric_only is not True: + warn = FutureWarning + else: + warn = None + msg = "Dropping invalid columns in DataFrameGroupBy.prod is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = func(numeric_only=numeric_only) expected = DataFrame(expected_data, index=expected_index) tm.assert_frame_equal(result, expected) From 57f0a3d274482011ffaf5fb1a938ab870a61fed2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Apr 2022 11:48:07 -0400 Subject: [PATCH 2/3] Revert whatsnew notable_bug_fix2 removal --- doc/source/whatsnew/v1.5.0.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8984c0895269f..aa42d4236484b 100644 --- 
a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -194,6 +194,11 @@ did not have the same index as the input. df.groupby('a', dropna=True).transform('ffill') df.groupby('a', dropna=True).transform(lambda x: x) +.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: + +notable_bug_fix2 +^^^^^^^^^^^^^^^^ + .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: From 2155fa1b61e4d485b8c303e1b9a87a822e6365ae Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Apr 2022 18:22:35 -0400 Subject: [PATCH 3/3] mypy fixup --- pandas/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 5e03f82a2b667..098b501cc95c9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -660,7 +660,7 @@ def resolve_numeric_only(numeric_only: bool | None | lib.NoDefault) -> bool: # first default to None result = False else: - result = numeric_only + result = cast(bool, numeric_only) return result