DEPR: Change numeric_only to False in various groupby ops (#49892)

rhshadrach · web-flow · commit dd13032c8588 · 2022-11-28T14:52:55.000-08:00
* DEPR: Change numeric_only to False in various groupby ops

* Remove FIXME
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -572,7 +572,7 @@ Removal of prior version deprecations/changes
 - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
 - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
 - Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
-- Changed default of ``numeric_only`` to ``False`` in :meth:`.DataFrameGroupBy.sum` and :meth:`.DataFrameGroupBy.mean` (:issue:`46072`)
+- Changed default of ``numeric_only`` to ``False`` in various :class:`.DataFrameGroupBy` methods (:issue:`46072`)
 - Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`)
 -
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -2220,7 +2220,7 @@ def skew(
         self,
         axis: Axis | None | lib.NoDefault = lib.no_default,
         skipna: bool = True,
-        numeric_only: bool | lib.NoDefault = lib.no_default,
+        numeric_only: bool = False,
         **kwargs,
     ) -> DataFrame:
         result = self._op_via_apply(
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2136,7 +2136,7 @@ def mean(
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
+    def median(self, numeric_only: bool = False):
         """
         Compute median of groups, excluding missing values.
 
@@ -2173,7 +2173,7 @@ def std(
         ddof: int = 1,
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
-        numeric_only: bool | lib.NoDefault = lib.no_default,
+        numeric_only: bool = False,
     ):
         """
         Compute standard deviation of groups, excluding missing values.
@@ -2202,11 +2202,15 @@ def std(
 
             .. versionadded:: 1.4.0
 
-        numeric_only : bool, default True
+        numeric_only : bool, default False
             Include only `float`, `int` or `boolean` data.
 
             .. versionadded:: 1.5.0
 
+            .. versionchanged:: 2.0.0
+
+                numeric_only now defaults to ``False``.
+
         Returns
         -------
         Series or DataFrame
@@ -2236,7 +2240,6 @@ def std(
                 post_processing=lambda vals, inference: np.sqrt(vals),
                 ddof=ddof,
             )
-            self._maybe_warn_numeric_only_depr("std", result, numeric_only)
             return result
 
     @final
@@ -2247,7 +2250,7 @@ def var(
         ddof: int = 1,
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
-        numeric_only: bool | lib.NoDefault = lib.no_default,
+        numeric_only: bool = False,
     ):
         """
         Compute variance of groups, excluding missing values.
@@ -2276,11 +2279,15 @@ def var(
 
             .. versionadded:: 1.4.0
 
-        numeric_only : bool, default True
+        numeric_only : bool, default False
             Include only `float`, `int` or `boolean` data.
 
             .. versionadded:: 1.5.0
 
+            .. versionchanged:: 2.0.0
+
+                numeric_only now defaults to ``False``.
+
         Returns
         -------
         Series or DataFrame
@@ -2301,7 +2308,7 @@ def var(
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
+    def sem(self, ddof: int = 1, numeric_only: bool = False):
         """
         Compute standard error of the mean of groups, excluding missing values.
 
@@ -2317,23 +2324,22 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default
 
             .. versionadded:: 1.5.0
 
+            .. versionchanged:: 2.0.0
+
+                numeric_only now defaults to ``False``.
+
         Returns
         -------
         Series or DataFrame
             Standard error of the mean of values within each group.
         """
         # Reolve numeric_only so that std doesn't warn
-        numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0)
-        if (
-            numeric_only_bool
-            and self.obj.ndim == 1
-            and not is_numeric_dtype(self.obj.dtype)
-        ):
+        if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
             raise TypeError(
                 f"{type(self).__name__}.sem called with "
                 f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
             )
-        result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
+        result = self.std(ddof=ddof, numeric_only=numeric_only)
         self._maybe_warn_numeric_only_depr("sem", result, numeric_only)
 
         if result.ndim == 1:
@@ -2411,10 +2417,8 @@ def sum(
             return self._reindex_output(result, fill_value=0)
 
     @final
-    @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
-    def prod(
-        self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
-    ):
+    @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
+    def prod(self, numeric_only: bool = False, min_count: int = 0):
         return self._agg_general(
             numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
         )
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -239,10 +239,7 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype
         [[1, 2, 3, 4, 5, 6]] * 3,
         columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
     ).astype({("a", "j"): dtype, ("b", "j"): dtype})
-    warn = FutureWarning if func == "std" else None
-    msg = "The default value of numeric_only"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = df.groupby(level=1, axis=1).agg(func)
+    result = df.groupby(level=1, axis=1).agg(func)
     expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
         result_dtype_dict
     )
@@ -266,10 +263,7 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
         columns=Index([10, 20, 10, 20], name="x"),
         dtype="int64",
     ).astype({10: "Int64"})
-    warn = FutureWarning if func == "std" else None
-    msg = "The default value of numeric_only"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = df.groupby("x", axis=1).agg(func)
+    result = df.groupby("x", axis=1).agg(func)
     expected = DataFrame(
         data=expected_data,
         index=Index([0, 1, 0], name="y"),
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -169,12 +169,9 @@ def test_averages(self, df, method):
             ],
         )
 
-        if method == "mean":
-            with pytest.raises(TypeError, match="[Cc]ould not convert"):
-                getattr(gb, method)()
-            result = getattr(gb, method)(numeric_only=True)
-        else:
-            result = getattr(gb, method)()
+        with pytest.raises(TypeError, match="[Cc]ould not convert"):
+            getattr(gb, method)()
+        result = getattr(gb, method)(numeric_only=True)
         tm.assert_frame_equal(result.reindex_like(expected), expected)
 
         expected_columns = expected.columns
@@ -276,11 +273,12 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
             )
             with pytest.raises(exception, match=msg):
                 getattr(gb, method)()
-        elif method in ("sum", "mean"):
+        elif method in ("sum", "mean", "median", "prod"):
             msg = "|".join(
                 [
                     "category type does not support sum operations",
-                    "Could not convert",
+                    "[Cc]ould not convert",
+                    "can't multiply sequence by non-int of type 'str'",
                 ]
             )
             with pytest.raises(exception, match=msg):
@@ -1397,18 +1395,18 @@ def test_groupby_sum_timedelta_with_nat():
         ("last", False, True),
         ("max", False, True),
         ("mean", False, True),
-        ("median", True, True),
+        ("median", False, True),
         ("min", False, True),
         ("nth", False, False),
         ("nunique", False, False),
         ("pct_change", False, False),
-        ("prod", True, True),
+        ("prod", False, True),
         ("quantile", True, True),
-        ("sem", True, True),
-        ("skew", True, True),
-        ("std", True, True),
+        ("sem", False, True),
+        ("skew", False, True),
+        ("std", False, True),
         ("sum", False, True),
-        ("var", True, True),
+        ("var", False, True),
     ],
 )
 @pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@@ -1592,6 +1590,11 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
             warn_msg = ""
             err_category = TypeError
             err_msg = "Series.skew does not allow numeric_only=True with non-numeric"
+        elif groupby_func == "sem":
+            warn_category = None
+            warn_msg = ""
+            err_category = TypeError
+            err_msg = "called with numeric_only=True and dtype object"
         else:
             warn_category = FutureWarning
             warn_msg = "This will raise a TypeError"
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -4,7 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._libs import lib
 from pandas.compat import IS64
 from pandas.errors import (
     PerformanceWarning,
@@ -909,64 +908,37 @@ def test_keep_nuisance_agg(df, agg_function):
     "agg_function",
     ["sum", "mean", "prod", "std", "var", "sem", "median"],
 )
-@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False])
+@pytest.mark.parametrize("numeric_only", [True, False])
 def test_omit_nuisance_agg(df, agg_function, numeric_only):
     # GH 38774, GH 38815
-    if numeric_only is lib.no_default or (not numeric_only and agg_function != "sum"):
-        # sum doesn't drop strings
-        warn = FutureWarning
-    else:
-        warn = None
-
     grouped = df.groupby("A")
 
     no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
-    if agg_function in no_drop_nuisance and numeric_only is False:
+    if agg_function in no_drop_nuisance and not numeric_only:
         # Added numeric_only as part of GH#46560; these do not drop nuisance
         # columns when numeric_only is False
         klass = ValueError if agg_function in ("std", "sem") else TypeError
         msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
         with pytest.raises(klass, match=msg):
             getattr(grouped, agg_function)(numeric_only=numeric_only)
     else:
-        if numeric_only is lib.no_default:
-            msg = (
-                f"The default value of numeric_only in DataFrameGroupBy.{agg_function}"
-            )
-        else:
-            msg = "Dropping invalid columns"
-        with tm.assert_produces_warning(warn, match=msg):
-            result = getattr(grouped, agg_function)(numeric_only=numeric_only)
-        if (
-            (numeric_only is lib.no_default or not numeric_only)
-            # These methods drop non-numeric columns even when numeric_only is False
-            and agg_function not in ("mean", "prod", "median")
-        ):
+        result = getattr(grouped, agg_function)(numeric_only=numeric_only)
+        if not numeric_only and agg_function == "sum":
+            # sum is successful on column B
             columns = ["A", "B", "C", "D"]
         else:
             columns = ["A", "C", "D"]
-        if agg_function == "sum" and numeric_only is False:
-            # sum doesn't drop nuisance string columns
-            warn = None
-        elif agg_function in ("sum", "std", "var", "sem") and numeric_only is not True:
-            warn = FutureWarning
-        else:
-            warn = None
-        msg = "The default value of numeric_only"
-        with tm.assert_produces_warning(warn, match=msg):
-            expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
-                numeric_only=numeric_only
-            )
+        expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
+            numeric_only=numeric_only
+        )
         tm.assert_frame_equal(result, expected)
 
 
-def test_omit_nuisance_warnings(df):
+def test_raise_on_nuisance_python_single(df):
     # GH 38815
-    with tm.assert_produces_warning(FutureWarning, filter_level="always"):
-        grouped = df.groupby("A")
-        result = grouped.skew()
-        expected = df.loc[:, ["A", "C", "D"]].groupby("A").skew()
-        tm.assert_frame_equal(result, expected)
+    grouped = df.groupby("A")
+    with pytest.raises(TypeError, match="could not convert"):
+        grouped.skew()
 
 
 def test_raise_on_nuisance_python_multiple(three_group):
@@ -2012,14 +1984,9 @@ def get_result(**kwargs):
             if df.dtypes[0].kind == "M":
                 # GH#41291
                 # datetime64 -> prod and sum are invalid
-                if op == "sum":
-                    with pytest.raises(
-                        TypeError, match="datetime64 type does not support"
-                    ):
-                        get_result()
-                    result = get_result(numeric_only=True)
-                else:
-                    result = get_result()
+                with pytest.raises(TypeError, match="datetime64 type does not support"):
+                    get_result()
+                result = get_result(numeric_only=True)
 
                 # with numeric_only=True, these are dropped, and we get
                 # an empty DataFrame back
@@ -2030,14 +1997,9 @@ def get_result(**kwargs):
             elif isinstance(values, Categorical):
                 # GH#41291
                 # Categorical doesn't implement sum or prod
-                if op == "sum":
-                    with pytest.raises(
-                        TypeError, match="category type does not support"
-                    ):
-                        get_result()
-                    result = get_result(numeric_only=True)
-                else:
-                    result = get_result()
+                with pytest.raises(TypeError, match="category type does not support"):
+                    get_result()
+                result = get_result(numeric_only=True)
 
                 # with numeric_only=True, these are dropped, and we get
                 # an empty DataFrame back
@@ -2053,24 +2015,22 @@ def get_result(**kwargs):
                 return
 
             elif df.dtypes[0] == object:
-                # FIXME: the test is actually wrong here, xref #41341
                 result = get_result()
-                # In this case we have list-of-list, will raise TypeError,
-                # and subsequently be dropped as nuisance columns
-                if op == "sum":
-                    expected = df.set_index(keys)[["C"]]
-                else:
-                    expected = df.set_index(keys)[[]]
+                expected = df.set_index(keys)[["C"]]
                 tm.assert_equal(result, expected)
                 return
 
-        if (
-            op in ["min", "max", "skew"]
-            and isinstance(values, Categorical)
-            and len(keys) == 1
+        if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or (
+            op == "skew" and df.dtypes[0].kind == "M"
         ):
-            if op in ("min", "max"):
-                with pytest.raises(TypeError, match="Categorical is not ordered"):
+            if op == "skew" or len(keys) == 1:
+                msg = "|".join(
+                    [
+                        "Categorical is not ordered",
+                        "does not support reduction",
+                    ]
+                )
+                with pytest.raises(TypeError, match=msg):
                     get_result()
                 return
             # Categorical doesn't implement, so with numeric_only=True
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py

Original file line number	Diff line number	Diff line change
`@@ -572,7 +572,7 @@ Removal of prior version deprecations/changes`
`572`	`572`	- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
`573`	`573`	- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
`574`	`574`	- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
`575`		-- Changed default of ``numeric_only`` to ``False`` in :meth:`.DataFrameGroupBy.sum` and :meth:`.DataFrameGroupBy.mean` (:issue:`46072`)
	`575`	+- Changed default of ``numeric_only`` to ``False`` in various :class:`.DataFrameGroupBy` methods (:issue:`46072`)
`576`	`576`	- Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`)
`577`	`577`	`-`
`578`	`578`