DEPR: numeric_only default in DataFrame methods with None/True (#46906)

rhshadrach · web-flow · commit 11462d67cd7b · 2022-05-04T21:22:55.000-04:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -120,7 +120,7 @@ Other enhancements
 - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
 - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
-- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`)
+- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.DataFrameGroupBy.quantile` (:issue:`46560`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`)
 - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
@@ -463,6 +463,48 @@ As ``group_keys=True`` is the default value of :meth:`DataFrame.groupby` and
 raise a ``FutureWarning``. This can be silenced and the previous behavior
 retained by specifying ``group_keys=False``.
 
+.. _whatsnew_150.deprecations.numeric_only_default:
+
+``numeric_only`` default value
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Across DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default
+value of the ``numeric_only`` argument, if it exists at all, was inconsistent.
+Furthermore, operations with the default value ``None`` can lead to surprising
+results. (:issue:`46560`)
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
+
+    In [2]: # Reading the next line without knowing the contents of df, one would
+            # expect the result to contain the products for both columns a and b.
+            df[["a", "b"]].prod()
+    Out[2]:
+    a    2
+    dtype: int64
+
+To avoid this behavior, specifying the value ``numeric_only=None`` has been
+deprecated, and will be removed in a future version of pandas. In the future,
+all operations with a ``numeric_only`` argument will default to ``False``. Users
+should either call the operation only with columns that can be operated on, or
+specify ``numeric_only=True`` to operate only on Boolean, integer, and float columns.
+
+In order to support the transition to the new behavior, the following methods have
+gained the ``numeric_only`` argument.
+
+- :meth:`DataFrame.corr`
+- :meth:`DataFrame.corrwith`
+- :meth:`DataFrame.cov`
+- :meth:`DataFrame.idxmin`
+- :meth:`DataFrame.idxmax`
+- :meth:`.DataFrameGroupBy.idxmin`
+- :meth:`.DataFrameGroupBy.idxmax`
+- :meth:`.GroupBy.var`
+- :meth:`.GroupBy.std`
+- :meth:`.GroupBy.sem`
+- :meth:`.DataFrameGroupBy.quantile`
+
 .. _whatsnew_150.deprecations.other:
 
 Other Deprecations
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -635,3 +635,63 @@ def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
         list of column names with the None values replaced.
     """
     return [f"level_{i}" if name is None else name for i, name in enumerate(names)]
+
+
+def resolve_numeric_only(numeric_only: bool | None | lib.NoDefault) -> bool:
+    """Determine the Boolean value of numeric_only.
+
+    See GH#46560 for details on the deprecation.
+
+    Parameters
+    ----------
+    numeric_only : bool, None, or lib.no_default
+        Value passed to the method.
+
+    Returns
+    -------
+    Resolved value of numeric_only.
+    """
+    if numeric_only is lib.no_default:
+        # Methods that behave like numeric_only=True and only got the numeric_only
+        # arg in 1.5.0 default to lib.no_default
+        result = True
+    elif numeric_only is None:
+        # Methods that had the numeric_only arg prior to 1.5.0 and try all columns
+        # first default to None
+        result = False
+    else:
+        result = cast(bool, numeric_only)
+    return result
+
+
+def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = False):
+    """Emit FutureWarning message for deprecation of numeric_only.
+
+    See GH#46560 for details on the deprecation.
+
+    Parameters
+    ----------
+    cls : type
+        pandas type that is generating the warning.
+    name : str
+        Name of the method that is generating the warning.
+    deprecate_none : bool, default False
+        Whether to also warn about the deprecation of specifying ``numeric_only=None``.
+    """
+    if name in ["all", "any"]:
+        arg_name = "bool_only"
+    else:
+        arg_name = "numeric_only"
+
+    msg = (
+        f"The default value of {arg_name} in {cls.__name__}.{name} is "
+        "deprecated. In a future version, it will default to False. "
+    )
+    if deprecate_none:
+        msg += f"In addition, specifying '{arg_name}=None' is deprecated. "
+    msg += (
+        f"Select only valid columns or specify the value of {arg_name} to silence "
+        "this warning."
+    )
+
+    warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9833,7 +9833,7 @@ def corr(
         self,
         method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
         min_periods: int = 1,
-        numeric_only: bool = True,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
     ) -> DataFrame:
         """
         Compute pairwise correlation of columns, excluding NA/null values.
@@ -9859,6 +9859,10 @@ def corr(
 
             .. versionadded:: 1.5.0
 
+            .. deprecated:: 1.5.0
+                The default value of ``numeric_only`` will be ``False`` in a future
+                version of pandas.
+
         Returns
         -------
         DataFrame
@@ -9897,10 +9901,11 @@ def corr(
         dogs   1.0   NaN
         cats   NaN   1.0
         """  # noqa:E501
-        if numeric_only:
-            data = self._get_numeric_data()
-        else:
-            data = self
+        numeric_only_bool = com.resolve_numeric_only(numeric_only)
+        data = self._get_numeric_data() if numeric_only_bool else self
+        if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
+            com.deprecate_numeric_only_default(type(self), "corr")
+
         cols = data.columns
         idx = cols.copy()
         mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
@@ -9946,7 +9951,7 @@ def cov(
         self,
         min_periods: int | None = None,
         ddof: int | None = 1,
-        numeric_only: bool = True,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
     ) -> DataFrame:
         """
         Compute pairwise covariance of columns, excluding NA/null values.
@@ -9983,6 +9988,10 @@ def cov(
 
             .. versionadded:: 1.5.0
 
+            .. deprecated:: 1.5.0
+                The default value of ``numeric_only`` will be ``False`` in a future
+                version of pandas.
+
         Returns
         -------
         DataFrame
@@ -10051,10 +10060,11 @@ def cov(
         b       NaN  1.248003  0.191417
         c -0.150812  0.191417  0.895202
         """
-        if numeric_only:
-            data = self._get_numeric_data()
-        else:
-            data = self
+        numeric_only_bool = com.resolve_numeric_only(numeric_only)
+        data = self._get_numeric_data() if numeric_only_bool else self
+        if numeric_only is lib.no_default and len(data.columns) < len(self.columns):
+            com.deprecate_numeric_only_default(type(self), "cov")
+
         cols = data.columns
         idx = cols.copy()
         mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
@@ -10077,7 +10087,7 @@ def corrwith(
         axis: Axis = 0,
         drop=False,
         method="pearson",
-        numeric_only: bool = True,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
     ) -> Series:
         """
         Compute pairwise correlation.
@@ -10110,6 +10120,10 @@ def corrwith(
 
             .. versionadded:: 1.5.0
 
+            .. deprecated:: 1.5.0
+                The default value of ``numeric_only`` will be ``False`` in a future
+                version of pandas.
+
         Returns
         -------
         Series
@@ -10141,10 +10155,10 @@ def corrwith(
         dtype: float64
         """  # noqa:E501
         axis = self._get_axis_number(axis)
-        if numeric_only:
-            this = self._get_numeric_data()
-        else:
-            this = self
+        numeric_only_bool = com.resolve_numeric_only(numeric_only)
+        this = self._get_numeric_data() if numeric_only_bool else self
+        if numeric_only is lib.no_default and len(this.columns) < len(self.columns):
+            com.deprecate_numeric_only_default(type(self), "corrwith")
 
         # GH46174: when other is a Series object and axis=0, we achieve a speedup over
         # passing .corr() to .apply() by taking the columns as ndarrays and iterating
@@ -10396,7 +10410,6 @@ def _reduce(
         filter_type=None,
         **kwds,
     ):
-
         assert filter_type is None or filter_type == "bool", filter_type
         out_dtype = "bool" if filter_type == "bool" else None
 
@@ -10451,14 +10464,15 @@ def _get_data() -> DataFrame:
                 data = self._get_bool_data()
             return data
 
+        numeric_only_bool = com.resolve_numeric_only(numeric_only)
         if numeric_only is not None or axis == 0:
             # For numeric_only non-None and axis non-None, we know
             #  which blocks to use and no try/except is needed.
             #  For numeric_only=None only the case with axis==0 and no object
             #  dtypes are unambiguous can be handled with BlockManager.reduce
             # Case with EAs see GH#35881
             df = self
-            if numeric_only is True:
+            if numeric_only_bool:
                 df = _get_data()
             if axis == 1:
                 df = df.T
@@ -10479,16 +10493,8 @@ def _get_data() -> DataFrame:
 
             if numeric_only is None and out.shape[0] != df.shape[1]:
                 # columns have been dropped GH#41480
-                arg_name = "numeric_only"
-                if name in ["all", "any"]:
-                    arg_name = "bool_only"
-                warnings.warn(
-                    "Dropping of nuisance columns in DataFrame reductions "
-                    f"(with '{arg_name}=None') is deprecated; in a future "
-                    "version this will raise TypeError.  Select only valid "
-                    "columns before calling the reduction.",
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
+                com.deprecate_numeric_only_default(
+                    type(self), name, deprecate_none=True
                 )
 
             return out
@@ -10776,6 +10782,11 @@ def quantile(
         numeric_only : bool, default True
             If False, the quantile of datetime and timedelta data will be
             computed as well.
+
+            .. deprecated:: 1.5.0
+                The default value of ``numeric_only`` will be ``False`` in a future
+                version of pandas.
+
         interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
             This optional parameter specifies the interpolation method to use,
             when the desired quantile lies between two data points `i` and `j`:
@@ -10833,15 +10844,8 @@ def quantile(
         axis = self._get_axis_number(axis)
         any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes)
         if numeric_only is no_default and any_not_numeric:
-            warnings.warn(
-                "In future versions of pandas, numeric_only will be set to "
-                "False by default, and the datetime/timedelta columns will "
-                "be considered in the results. To not consider these columns"
-                "specify numeric_only=True.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-            numeric_only = True
+            com.deprecate_numeric_only_default(type(self), "quantile")
+        numeric_only = com.resolve_numeric_only(numeric_only)
 
         if not is_list_like(q):
             # BlockManager.quantile expects listlike, so we wrap and unwrap here
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -11558,6 +11558,11 @@ def _doc_params(cls):
 numeric_only : bool, default None
     Include only float, int, boolean columns. If None, will attempt to use
     everything, then use only numeric data. Not implemented for Series.
+
+    .. deprecated:: 1.5.0
+        Specifying ``numeric_only=None`` is deprecated. The default value will be
+        ``False`` in a future version of pandas.
+
 {min_count}\
 **kwargs
     Additional keyword arguments to be passed to the function.
@@ -11588,6 +11593,10 @@ def _doc_params(cls):
     Include only float, int, boolean columns. If None, will attempt to use
     everything, then use only numeric data. Not implemented for Series.
 
+    .. deprecated:: 1.5.0
+        Specifying ``numeric_only=None`` is deprecated. The default value will be
+        ``False`` in a future version of pandas.
+
 Returns
 -------
 {name1} or {name2} (if level specified) \
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -41,7 +41,10 @@ def test_cov(self, float_frame, float_string_frame):
         tm.assert_almost_equal(result["A"]["C"], expected)
 
         # exclude non-numeric types
-        result = float_string_frame.cov()
+        with tm.assert_produces_warning(
+            FutureWarning, match="The default value of numeric_only"
+        ):
+            result = float_string_frame.cov()
         expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
         tm.assert_frame_equal(result, expected)
 
@@ -116,7 +119,10 @@ def test_corr_scipy_method(self, float_frame, method):
 
     def test_corr_non_numeric(self, float_string_frame):
         # exclude non-numeric types
-        result = float_string_frame.corr()
+        with tm.assert_produces_warning(
+            FutureWarning, match="The default value of numeric_only"
+        ):
+            result = float_string_frame.corr()
         expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
         tm.assert_frame_equal(result, expected)
 
@@ -307,11 +313,17 @@ def test_corrwith_with_objects(self):
         df1["obj"] = "foo"
         df2["obj"] = "bar"
 
-        result = df1.corrwith(df2)
+        with tm.assert_produces_warning(
+            FutureWarning, match="The default value of numeric_only"
+        ):
+            result = df1.corrwith(df2)
         expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
         tm.assert_series_equal(result, expected)
 
-        result = df1.corrwith(df2, axis=1)
+        with tm.assert_produces_warning(
+            FutureWarning, match="The default value of numeric_only"
+        ):
+            result = df1.corrwith(df2, axis=1)
         expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
         tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py