ENH: Add numeric_only to certain groupby ops (#46728)

rhshadrach · web-flow · commit 4a072fa0d0d3 · 2022-04-29T20:46:05.000-04:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -120,7 +120,7 @@ Other enhancements
 - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
 - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
-- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
+- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`)
 - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -10605,11 +10605,17 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
         """
         return self.apply(Series.nunique, axis=axis, dropna=dropna)
 
-    @doc(_shared_docs["idxmin"])
-    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
+    @doc(_shared_docs["idxmin"], numeric_only_default="False")
+    def idxmin(
+        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
+    ) -> Series:
         axis = self._get_axis_number(axis)
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
 
-        res = self._reduce(
+        res = data._reduce(
             nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
         )
         indices = res._values
@@ -10619,15 +10625,22 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
         # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
         assert isinstance(indices, np.ndarray)  # for mypy
 
-        index = self._get_axis(axis)
+        index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
-        return self._constructor_sliced(result, index=self._get_agg_axis(axis))
+        return data._constructor_sliced(result, index=data._get_agg_axis(axis))
+
+    @doc(_shared_docs["idxmax"], numeric_only_default="False")
+    def idxmax(
+        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
+    ) -> Series:
 
-    @doc(_shared_docs["idxmax"])
-    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
         axis = self._get_axis_number(axis)
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
 
-        res = self._reduce(
+        res = data._reduce(
             nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
         )
         indices = res._values
@@ -10637,9 +10650,9 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
         # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
         assert isinstance(indices, np.ndarray)  # for mypy
 
-        index = self._get_axis(axis)
+        index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
-        return self._constructor_sliced(result, index=self._get_agg_axis(axis))
+        return data._constructor_sliced(result, index=data._get_agg_axis(axis))
 
     def _get_agg_axis(self, axis_num: int) -> Index:
         """
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1555,10 +1555,14 @@ def nunique(self, dropna: bool = True) -> DataFrame:
 
         return results
 
-    @doc(_shared_docs["idxmax"])
-    def idxmax(self, axis=0, skipna: bool = True):
+    @doc(
+        _shared_docs["idxmax"],
+        numeric_only_default="True for axis=0, False for axis=1",
+    )
+    def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
         axis = DataFrame._get_axis_number(axis)
-        numeric_only = None if axis == 0 else False
+        if numeric_only is None:
+            numeric_only = None if axis == 0 else False
 
         def func(df):
             # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
@@ -1577,13 +1581,17 @@ def func(df):
         func.__name__ = "idxmax"
         return self._python_apply_general(func, self._obj_with_exclusions)
 
-    @doc(_shared_docs["idxmin"])
-    def idxmin(self, axis=0, skipna: bool = True):
+    @doc(
+        _shared_docs["idxmin"],
+        numeric_only_default="True for axis=0, False for axis=1",
+    )
+    def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
         axis = DataFrame._get_axis_number(axis)
-        numeric_only = None if axis == 0 else False
+        if numeric_only is None:
+            numeric_only = None if axis == 0 else False
 
         def func(df):
-            # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
+            # NB: here we use numeric_only=None, in DataFrame it is False GH#46560
             res = df._reduce(
                 nanops.nanargmin,
                 "argmin",
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1502,7 +1502,7 @@ def _python_apply_general(
         )
 
     @final
-    def _python_agg_general(self, func, *args, **kwargs):
+    def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs):
         func = com.is_builtin_func(func)
         f = lambda x: func(x, *args, **kwargs)
 
@@ -1520,6 +1520,8 @@ def _python_agg_general(self, func, *args, **kwargs):
                 # if this function is invalid for this dtype, we will ignore it.
                 result = self.grouper.agg_series(obj, f)
             except TypeError:
+                if raise_on_typeerror:
+                    raise
                 warn_dropping_nuisance_columns_deprecated(type(self), "agg")
                 continue
 
@@ -1593,7 +1595,12 @@ def _agg_py_fallback(
 
     @final
     def _cython_agg_general(
-        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
+        self,
+        how: str,
+        alt: Callable,
+        numeric_only: bool,
+        min_count: int = -1,
+        ignore_failures: bool = True,
     ):
         # Note: we never get here with how="ohlc" for DataFrameGroupBy;
         #  that goes through SeriesGroupBy
@@ -1629,7 +1636,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
 
         # TypeError -> we may have an exception in trying to aggregate
         #  continue and exclude the block
-        new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
+        new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
 
         if not is_ser and len(new_mgr) < len(data):
             warn_dropping_nuisance_columns_deprecated(type(self), how)
@@ -2041,6 +2048,7 @@ def std(
         ddof: int = 1,
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
     ):
         """
         Compute standard deviation of groups, excluding missing values.
@@ -2069,6 +2077,11 @@ def std(
 
             .. versionadded:: 1.4.0
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         Series or DataFrame
@@ -2081,8 +2094,9 @@ def std(
         else:
             return self._get_cythonized_result(
                 libgroupby.group_var,
-                needs_counts=True,
                 cython_dtype=np.dtype(np.float64),
+                numeric_only=numeric_only,
+                needs_counts=True,
                 post_processing=lambda vals, inference: np.sqrt(vals),
                 ddof=ddof,
             )
@@ -2095,6 +2109,7 @@ def var(
         ddof: int = 1,
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
     ):
         """
         Compute variance of groups, excluding missing values.
@@ -2123,6 +2138,11 @@ def var(
 
             .. versionadded:: 1.4.0
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         Series or DataFrame
@@ -2133,22 +2153,25 @@ def var(
 
             return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
         else:
+            numeric_only_bool = self._resolve_numeric_only(numeric_only)
             if ddof == 1:
-                numeric_only = self._resolve_numeric_only(lib.no_default)
                 return self._cython_agg_general(
                     "var",
                     alt=lambda x: Series(x).var(ddof=ddof),
-                    numeric_only=numeric_only,
+                    numeric_only=numeric_only_bool,
+                    ignore_failures=numeric_only is lib.no_default,
                 )
             else:
                 func = lambda x: x.var(ddof=ddof)
                 with self._group_selection_context():
-                    return self._python_agg_general(func)
+                    return self._python_agg_general(
+                        func, raise_on_typeerror=not numeric_only_bool
+                    )
 
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def sem(self, ddof: int = 1):
+    def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
         """
         Compute standard error of the mean of groups, excluding missing values.
 
@@ -2159,12 +2182,17 @@ def sem(self, ddof: int = 1):
         ddof : int, default 1
             Degrees of freedom.
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         Series or DataFrame
             Standard error of the mean of values within each group.
         """
-        result = self.std(ddof=ddof)
+        result = self.std(ddof=ddof, numeric_only=numeric_only)
         if result.ndim == 1:
             result /= np.sqrt(self.count())
         else:
@@ -2979,7 +3007,12 @@ def nth(
         return result
 
     @final
-    def quantile(self, q=0.5, interpolation: str = "linear"):
+    def quantile(
+        self,
+        q=0.5,
+        interpolation: str = "linear",
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+    ):
         """
         Return group values at the given quantile, a la numpy.percentile.
 
@@ -2989,6 +3022,10 @@ def quantile(self, q=0.5, interpolation: str = "linear"):
             Value(s) between 0 and 1 providing the quantile(s) to compute.
         interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
             Method to use when the desired quantile falls between two points.
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
 
         Returns
         -------
@@ -3013,6 +3050,7 @@ def quantile(self, q=0.5, interpolation: str = "linear"):
         a    2.0
         b    3.0
         """
+        numeric_only_bool = self._resolve_numeric_only(numeric_only)
 
         def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
             if is_object_dtype(vals):
@@ -3106,9 +3144,15 @@ def blk_func(values: ArrayLike) -> ArrayLike:
         obj = self._obj_with_exclusions
         is_ser = obj.ndim == 1
         mgr = self._get_data_to_aggregate()
-
-        res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
-        if not is_ser and len(res_mgr.items) != len(mgr.items):
+        data = mgr.get_numeric_data() if numeric_only_bool else mgr
+        ignore_failures = numeric_only_bool
+        res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
+
+        if (
+            numeric_only is lib.no_default
+            and not is_ser
+            and len(res_mgr.items) != len(mgr.items)
+        ):
             warn_dropping_nuisance_columns_deprecated(type(self), "quantile")
 
             if len(res_mgr.items) == 0:
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
@@ -749,6 +749,10 @@
     skipna : bool, default True
         Exclude NA/null values. If an entire row/column is NA, the result
         will be NA.
+    numeric_only : bool, default {numeric_only_default}
+        Include only `float`, `int` or `boolean` data.
+
+        .. versionadded:: 1.5.0
 
     Returns
     -------
@@ -812,6 +816,10 @@
     skipna : bool, default True
         Exclude NA/null values. If an entire row/column is NA, the result
         will be NA.
+    numeric_only : bool, default {numeric_only_default}
+        Include only `float`, `int` or `boolean` data.
+
+        .. versionadded:: 1.5.0
 
     Returns
     -------
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -897,6 +897,17 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis):
             expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
             tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_idxmin_numeric_only(self, numeric_only):
+        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
+        if numeric_only:
+            result = df.idxmin(numeric_only=numeric_only)
+            expected = Series([2, 1], index=["a", "b"])
+            tm.assert_series_equal(result, expected)
+        else:
+            with pytest.raises(TypeError, match="not allowed for this dtype"):
+                df.idxmin(numeric_only=numeric_only)
+
     def test_idxmin_axis_2(self, float_frame):
         frame = float_frame
         msg = "No axis named 2 for object type DataFrame"
@@ -914,6 +925,17 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis):
             expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
             tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_idxmax_numeric_only(self, numeric_only):
+        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
+        if numeric_only:
+            result = df.idxmax(numeric_only=numeric_only)
+            expected = Series([1, 0], index=["a", "b"])
+            tm.assert_series_equal(result, expected)
+        else:
+            with pytest.raises(TypeError, match="not allowed for this dtype"):
+                df.idxmin(numeric_only=numeric_only)
+
     def test_idxmax_axis_2(self, float_frame):
         frame = float_frame
         msg = "No axis named 2 for object type DataFrame"
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -495,8 +495,9 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i):
         ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
     ],
 )
+@pytest.mark.parametrize("numeric_only", [True, False])
 @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
-def test_idxmin_idxmax_returns_int_types(func, values):
+def test_idxmin_idxmax_returns_int_types(func, values, numeric_only):
     # GH 25444
     df = DataFrame(
         {
@@ -513,12 +514,15 @@ def test_idxmin_idxmax_returns_int_types(func, values):
     df["c_Integer"] = df["c_int"].astype("Int64")
     df["c_Floating"] = df["c_float"].astype("Float64")
 
-    result = getattr(df.groupby("name"), func)()
+    result = getattr(df.groupby("name"), func)(numeric_only=numeric_only)
 
     expected = DataFrame(values, index=Index(["A", "B"], name="name"))
-    expected["c_date_tz"] = expected["c_date"]
-    expected["c_timedelta"] = expected["c_date"]
-    expected["c_period"] = expected["c_date"]
+    if numeric_only:
+        expected = expected.drop(columns=["c_date"])
+    else:
+        expected["c_date_tz"] = expected["c_date"]
+        expected["c_timedelta"] = expected["c_date"]
+        expected["c_period"] = expected["c_date"]
     expected["c_Integer"] = expected["c_int"]
     expected["c_Floating"] = expected["c_float"]
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py