API: add numeric_only support to groupby agg

nicholas-ys-tan · nicholas-ys-tan · commit 181f5932ff2f · 2024-04-03T23:52:06.000+11:00
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -303,13 +303,14 @@ def agg_list_like(self) -> DataFrame | Series:
         -------
         Result of aggregation.
         """
-        return self.agg_or_apply_list_like(op_name="agg")
+        kwargs = self.kwargs
+        return self.agg_or_apply_list_like(op_name="agg", **kwargs)
 
     def compute_list_like(
         self,
         op_name: Literal["agg", "apply"],
         selected_obj: Series | DataFrame,
-        kwargs: dict[str, Any],
+        **kwargs: Any,
     ) -> tuple[list[Hashable] | Index, list[Any]]:
         """
         Compute agg/apply results for like-like input.
@@ -333,7 +334,6 @@ def compute_list_like(
         """
         func = cast(list[AggFuncTypeBase], self.func)
         obj = self.obj
-
         results = []
         keys = []
 
@@ -348,7 +348,6 @@ def compute_list_like(
                 )
                 new_res = getattr(colg, op_name)(a, *args, **kwargs)
                 results.append(new_res)
-
                 # make sure we find a good name
                 name = com.get_callable_name(a) or a
                 keys.append(name)
@@ -691,10 +690,9 @@ def agg_axis(self) -> Index:
         return self.obj._get_agg_axis(self.axis)
 
     def agg_or_apply_list_like(
-        self, op_name: Literal["agg", "apply"]
+        self, op_name: Literal["agg", "apply"], numeric_only=False, **kwargs
     ) -> DataFrame | Series:
         obj = self.obj
-        kwargs = self.kwargs
 
         if op_name == "apply":
             if isinstance(self, FrameApply):
@@ -708,7 +706,6 @@ def agg_or_apply_list_like(
 
         if getattr(obj, "axis", 0) == 1:
             raise NotImplementedError("axis other than 0 is not supported")
-
-        keys, results = self.compute_list_like(op_name, obj, kwargs)
+        keys, results = self.compute_list_like(op_name, obj, **kwargs)
         result = self.wrap_results_list_like(keys, results)
         return result
@@ -1485,28 +1482,40 @@ def transform(self):
         raise NotImplementedError
 
     def agg_or_apply_list_like(
-        self, op_name: Literal["agg", "apply"]
+        self, op_name: Literal["agg", "apply"], numeric_only=False, **kwargs
     ) -> DataFrame | Series:
+        """
+        Compute agg/apply results for a list-like ``func`` on a groupby-like object.
+
+        ``numeric_only`` is passed to ``obj._get_data_to_aggregate`` when the
+        object provides it; the remaining ``kwargs`` are forwarded to
+        ``compute_list_like``.
+        """
         obj = self.obj
-        kwargs = self.kwargs
+
         if op_name == "apply":
             kwargs = {**kwargs, "by_row": False}
 
         if getattr(obj, "axis", 0) == 1:
             raise NotImplementedError("axis other than 0 is not supported")
 
-        if obj._selected_obj.ndim == 1:
-            # For SeriesGroupBy this matches _obj_with_exclusions
+        if hasattr(obj, "_get_data_to_aggregate"):
+            # Select through the manager so numeric_only is honored.
+            mgr = obj._get_data_to_aggregate(numeric_only=numeric_only)
+            selected_obj = obj._wrap_agged_manager(mgr)
+        elif obj._selected_obj.ndim == 1:
+            # Objects without the manager helpers (Window/Resample may reach
+            # this method) keep the attribute-based selection.
             selected_obj = obj._selected_obj
         else:
             selected_obj = obj._obj_with_exclusions
 
         # Only set as_index=True on groupby objects, not Window or Resample
         # that inherit from this class.
         with com.temp_setattr(
             obj, "as_index", True, condition=hasattr(obj, "as_index")
         ):
-            keys, results = self.compute_list_like(op_name, selected_obj, kwargs)
+            keys, results = self.compute_list_like(op_name, selected_obj, **kwargs)
         result = self.wrap_results_list_like(keys, results)
         return result
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1553,7 +1553,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
 
             else:
                 # try to treat as if we are passing a list
-                gba = GroupByApply(self, [func], args=(), kwargs={})
+                gba = GroupByApply(self, [func], args=args, kwargs=kwargs)
                 try:
                     result = gba.agg()
 
@@ -1582,15 +1582,15 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
 
     agg = aggregate
 
-    def _python_agg_general(self, func, *args, **kwargs):
+    def _python_agg_general(self, func, *args, numeric_only=False, **kwargs):
         f = lambda x: func(x, *args, **kwargs)
 
         if self.ngroups == 0:
             # e.g. test_evaluate_with_empty_groups different path gets different
             #  result dtype in empty case.
             return self._python_apply_general(f, self._selected_obj, is_agg=True)
-
-        obj = self._obj_with_exclusions
+        mgr = self._get_data_to_aggregate(numeric_only=numeric_only)
+        obj = self._wrap_agged_manager(mgr)
 
         if not len(obj.columns):
             # e.g. test_margins_no_values_no_cols
@@ -1605,19 +1605,27 @@ def _python_agg_general(self, func, *args, **kwargs):
         res.columns = obj.columns.copy(deep=False)
         return self._wrap_aggregated_output(res)
 
-    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
+    def _aggregate_frame(self, func, *args, numeric_only=False, **kwargs) -> DataFrame:
+        """
+        Apply ``func`` to each group's sub-frame and assemble the results.
+
+        ``numeric_only`` is passed to ``_get_data_to_aggregate`` to choose the
+        data that is iterated; ``args`` and ``kwargs`` are forwarded to ``func``.
+        Raises AssertionError unless there is exactly one grouping key.
+        """
         if self._grouper.nkeys != 1:
             raise AssertionError("Number of keys must be 1")
 
-        obj = self._obj_with_exclusions
+        mgr = self._get_data_to_aggregate(numeric_only=numeric_only)
+        data = self._wrap_agged_manager(mgr)
 
         result: dict[Hashable, NDFrame | np.ndarray] = {}
-        for name, grp_df in self._grouper.get_iterator(obj):
+        for name, grp_df in self._grouper.get_iterator(data):
             fres = func(grp_df, *args, **kwargs)
             result[name] = fres
 
         result_index = self._grouper.result_index
-        out = self.obj._constructor(result, index=obj.columns, columns=result_index)
+        out = self.obj._constructor(result, index=data.columns, columns=result_index)
         out = out.T
 
         return out
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1663,3 +1663,59 @@ def func(x):
     msg = "length must not be 0"
     with pytest.raises(ValueError, match=msg):
         df.groupby("A", observed=False).agg(func)
+
+
+@pytest.mark.parametrize(
+    "aggfunc",
+    [
+        "mean",
+        np.mean,
+        ["sum", "mean"],
+        [np.sum, np.mean],
+        ["sum", np.mean],
+        lambda x: x.mean(),
+        {"A": "mean"},
+        {"A": "mean", "B": "sum"},
+        {"A": np.mean},
+    ],
+    ids=[
+        "string_mean",
+        "numpy_mean",
+        "list_of_str_and_str",
+        "list_of_numpy_and_numpy",
+        "list_of_str_and_numpy",
+        "lambda",
+        "dict_with_str",
+        "dict_with_2_vars",
+        "dict_with_numpy",
+    ],
+)
+@pytest.mark.parametrize(
+    "groupers",
+    ["groupby1", "groupby2", ["groupby1", "groupby2"]],
+    ids=["1_grouper_str", "1_grouper_int", "2_groupers_str_and_int"],
+)
+@pytest.mark.parametrize(
+    "numeric_only", [True, None], ids=["numeric_only_true", "no_numeric_only_arg"]
+)
+def test_different_combinations_of_groupby_agg(aggfunc, groupers, numeric_only):
+    # TODO: extend the parametrization to cover other keyword arguments.
+    df = DataFrame(
+        {
+            "A": [1, 2, 3, 4, 5],
+            "B": [10, 20, 30, 40, 50],
+            "groupby1": ["diamond", "diamond", "spade", "spade", "spade"],
+            "groupby2": [1, 1, 1, 2, 2],
+            "attr": ["a", "b", "c", "d", "e"],
+        }
+    )
+    gb = df.groupby(by=groupers)
+    if numeric_only:
+        gb.agg(func=aggfunc, numeric_only=True)
+    elif isinstance(aggfunc, dict):
+        # Only numeric columns are selected by the dict, so no kwarg is needed.
+        gb.agg(func=aggfunc)
+    else:
+        # The string-valued "attr" column is expected to make the aggregation fail.
+        with pytest.raises(TypeError):
+            gb.agg(func=aggfunc)
diff --git a/test.tar b/test.tar