diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index e8df24850f7a8..e0e78bc7fd2fc 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -303,13 +303,14 @@ def agg_list_like(self) -> DataFrame | Series:
         -------
         Result of aggregation.
         """
-        return self.agg_or_apply_list_like(op_name="agg")
+        kwargs = self.kwargs
+        return self.agg_or_apply_list_like(op_name="agg", **kwargs)
 
     def compute_list_like(
         self,
         op_name: Literal["agg", "apply"],
         selected_obj: Series | DataFrame,
-        kwargs: dict[str, Any],
+        **kwargs: Any,
     ) -> tuple[list[Hashable] | Index, list[Any]]:
         """
         Compute agg/apply results for like-like input.
@@ -348,7 +349,6 @@ def compute_list_like(
                 )
                 new_res = getattr(colg, op_name)(a, *args, **kwargs)
                 results.append(new_res)
-
                 # make sure we find a good name
                 name = com.get_callable_name(a) or a
                 keys.append(name)
@@ -691,10 +691,15 @@ def agg_axis(self) -> Index:
         return self.obj._get_agg_axis(self.axis)
 
     def agg_or_apply_list_like(
-        self, op_name: Literal["agg", "apply"]
+        self, op_name: Literal["agg", "apply"], numeric_only: bool = False, **kwargs
     ) -> DataFrame | Series:
         obj = self.obj
-        kwargs = self.kwargs
+        # NOTE(review): kwargs are no longer read from self.kwargs here; confirm
+        # that the op_name="apply" caller passes them explicitly as well.
+        if numeric_only:
+            # numeric_only does not select columns on this path, so hand it back
+            # to the aggregation functions instead of silently dropping it.
+            kwargs["numeric_only"] = numeric_only
 
         if op_name == "apply":
             if isinstance(self, FrameApply):
@@ -709,7 +714,7 @@ def agg_or_apply_list_like(
         if getattr(obj, "axis", 0) == 1:
             raise NotImplementedError("axis other than 0 is not supported")
 
-        keys, results = self.compute_list_like(op_name, obj, kwargs)
+        keys, results = self.compute_list_like(op_name, obj, **kwargs)
         result = self.wrap_results_list_like(keys, results)
         return result
 
@@ -1485,28 +1490,28 @@ def transform(self):
         raise NotImplementedError
 
     def agg_or_apply_list_like(
-        self, op_name: Literal["agg", "apply"]
+        self, op_name: Literal["agg", "apply"], numeric_only: bool = False, **kwargs
     ) -> DataFrame | Series:
         obj = self.obj
-        kwargs = self.kwargs
         if op_name == "apply":
             kwargs = {**kwargs, "by_row": False}
 
         if getattr(obj, "axis", 0) == 1:
             raise NotImplementedError("axis other than 0 is not supported")
 
         if obj._selected_obj.ndim == 1:
             # For SeriesGroupBy this matches _obj_with_exclusions
             selected_obj = obj._selected_obj
+        elif numeric_only:
+            selected_obj = obj._obj_numeric_only_with_exclusions
         else:
             selected_obj = obj._obj_with_exclusions
-
         # Only set as_index=True on groupby objects, not Window or Resample
         # that inherit from this class.
         with com.temp_setattr(
             obj, "as_index", True, condition=hasattr(obj, "as_index")
         ):
-            keys, results = self.compute_list_like(op_name, selected_obj, kwargs)
+            keys, results = self.compute_list_like(op_name, selected_obj, **kwargs)
         result = self.wrap_results_list_like(keys, results)
         return result
 
diff --git a/pandas/core/base.py b/pandas/core/base.py
index f5eefe1b4ab92..c9bff1fe0bb06 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -219,6 +219,22 @@ def _obj_with_exclusions(self):
         else:
             return self.obj
 
+    @final
+    @cache_readonly
+    def _obj_numeric_only_with_exclusions(self):
+        """
+        Numeric-only counterpart of ``_obj_with_exclusions``.
+
+        A Series is returned unchanged because Series has no ``select_dtypes``;
+        a DataFrame is narrowed with ``select_dtypes("number")``.
+        """
+        # NOTE(review): "number" leaves out bool columns; confirm that matches
+        # the numeric_only column selection used by the other groupby paths.
+        obj = self._obj_with_exclusions
+        if isinstance(obj, ABCSeries):
+            return obj
+        return obj.select_dtypes("number")
+
     def __getitem__(self, key):
         if self._selection is not None:
             raise IndexError(f"Column(s) {self._selection} already selected")
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 0a048d11d0b4d..0877f0c9c8a8f 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1553,7 +1553,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
 
             else:
                 # try to treat as if we are passing a list
-                gba = GroupByApply(self, [func], args=(), kwargs={})
+                gba = GroupByApply(self, [func], args=args, kwargs=kwargs)
                 try:
                     result = gba.agg()
 
@@ -1582,7 +1582,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
 
     agg = aggregate
 
-    def _python_agg_general(self, func, *args, **kwargs):
+    def _python_agg_general(self, func, *args, numeric_only=False, **kwargs):
         f = lambda x: func(x, *args, **kwargs)
 
         if self.ngroups == 0:
@@ -1590,7 +1590,10 @@ def _python_agg_general(self, func, *args, **kwargs):
             # result dtype in empty case.
             return self._python_apply_general(f, self._selected_obj, is_agg=True)
 
-        obj = self._obj_with_exclusions
+        if numeric_only:
+            obj = self._obj_numeric_only_with_exclusions
+        else:
+            obj = self._obj_with_exclusions
 
         if not len(obj.columns):
             # e.g. test_margins_no_values_no_cols
@@ -1605,19 +1608,19 @@ def _python_agg_general(self, func, *args, **kwargs):
         res.columns = obj.columns.copy(deep=False)
         return self._wrap_aggregated_output(res)
 
-    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
+    def _aggregate_frame(self, func, *args, numeric_only=False, **kwargs) -> DataFrame:
         if self._grouper.nkeys != 1:
             raise AssertionError("Number of keys must be 1")
 
-        obj = self._obj_with_exclusions
-
+        mgr = self._get_data_to_aggregate(numeric_only=numeric_only)
+        data = self._wrap_agged_manager(mgr)
         result: dict[Hashable, NDFrame | np.ndarray] = {}
-        for name, grp_df in self._grouper.get_iterator(obj):
+        for name, grp_df in self._grouper.get_iterator(data):
             fres = func(grp_df, *args, **kwargs)
             result[name] = fres
 
         result_index = self._grouper.result_index
-        out = self.obj._constructor(result, index=obj.columns, columns=result_index)
+        out = self.obj._constructor(result, index=data.columns, columns=result_index)
         out = out.T
 
         return out
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 2b9df1b7079da..098b73bbfde7b 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1663,3 +1663,58 @@ def func(x):
     msg = "length must not be 0"
     with pytest.raises(ValueError, match=msg):
         df.groupby("A", observed=False).agg(func)
+
+
+@pytest.mark.parametrize(
+    "aggfunc",
+    [
+        "mean",
+        np.mean,
+        ["sum", "mean"],
+        [np.sum, np.mean],
+        ["sum", np.mean],
+        lambda x: x.mean(),
+        {"A": "mean"},
+        {"A": "mean", "B": "sum"},
+        {"A": np.mean},
+    ],
+    ids=[
+        "string_mean",
+        "numpy_mean",
+        "list_of_str_and_str",
+        "list_of_numpy_and_numpy",
+        "list_of_str_and_numpy",
+        "lambda",
+        "dict_with_str",
+        "dict_with_2_vars",
+        "dict_with_numpy",
+    ],
+)
+@pytest.mark.parametrize(
+    "groupers",
+    ["groupby1", "groupby2", ["groupby1", "groupby2"]],
+    ids=["1_grouper_str", "1_grouper_int", "2_groupers_str_and_int"],
+)
+@pytest.mark.parametrize(
+    "numeric_only", [True, None], ids=["numeric_only_true", "no_numeric_only_arg"]
+)  # need to add other kwargs
+def test_different_combinations_of_groupby_agg(aggfunc, groupers, numeric_only):
+    # The non-numeric "attr" column is expected to make the aggregation raise
+    # unless numeric_only=True is passed or a dict picks the numeric columns.
+    df = DataFrame(
+        {
+            "A": [1, 2, 3, 4, 5],
+            "B": [10, 20, 30, 40, 50],
+            "groupby1": ["diamond", "diamond", "spade", "spade", "spade"],
+            "groupby2": [1, 1, 1, 2, 2],
+            "attr": ["a", "b", "c", "d", "e"],
+        }
+    )
+    # None means "do not pass numeric_only at all", as the test id promises.
+    kwargs = {} if numeric_only is None else {"numeric_only": numeric_only}
+    gb = df.groupby(by=groupers)
+    if numeric_only or isinstance(aggfunc, dict):
+        gb.agg(func=aggfunc, **kwargs)
+    else:
+        with pytest.raises(TypeError):
+            gb.agg(func=aggfunc, **kwargs)