diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 2014dbd9865f3..9895fc606f70d 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -595,7 +595,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", animals.groupby("kind").agg( min_height=pd.NamedAgg(column='height', aggfunc='min'), max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='height', aggfunc=np.mean), + average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), ) @@ -606,7 +606,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", animals.groupby("kind").agg( min_height=('height', 'min'), max_height=('height', 'max'), - average_weight=('height', np.mean), + average_weight=('weight', np.mean), ) @@ -630,6 +630,16 @@ requires additional arguments, partially apply them with :meth:`functools.partia consistent. To ensure consistent ordering, the keys (and so output columns) will always be sorted for Python 3.5. +Named aggregation is also valid for Series groupby aggregations. In this case there's +no column selection, so the values are just the functions. + +.. ipython:: python + + animals.groupby("kind").height.agg( + min_height='min', + max_height='max', + ) + Applying different functions to DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index df22a21196dab..9cc44a91cf72b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -28,7 +28,7 @@ Groupby Aggregation with Relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas has added special groupby behavior, known as "named aggregation", for naming the -output columns when applying multiple aggregation functions to specific columns (:issue:`18366`). 
+output columns when applying multiple aggregation functions to specific columns (:issue:`18366`, :issue:`26512`). .. ipython:: python @@ -39,7 +39,7 @@ output columns when applying multiple aggregation functions to specific columns animals.groupby("kind").agg( min_height=pd.NamedAgg(column='height', aggfunc='min'), max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='height', aggfunc=np.mean), + average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), ) Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` @@ -52,12 +52,26 @@ what the arguments to the function are, but plain tuples are accepted as well. animals.groupby("kind").agg( min_height=('height', 'min'), max_height=('height', 'max'), - average_weight=('height', np.mean), + average_weight=('weight', np.mean), ) Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). +A similar approach is now available for Series groupby objects as well. Because there's no need for +column selection, the values can just be the functions to apply. + +.. ipython:: python + + animals.groupby("kind").height.agg( + min_height="min", + max_height="max", + ) + + +This type of aggregation is the recommended alternative to the deprecated behavior when passing +a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). + See :ref:`_groupby.aggregate.named` for more. .. 
_whatsnew_0250.enhancements.other: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9ded2450693dd..57d14cb4c15d7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -735,6 +735,17 @@ def _selection_name(self): min max 1 1 2 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum='min', + ... maximum='max', + ... ) + minimum maximum + 1 1 2 + 2 3 4 """) @Appender(_apply_docs['template'] @@ -749,8 +760,24 @@ def apply(self, func, *args, **kwargs): klass='Series', axis='') @Appender(_shared_docs['aggregate']) - def aggregate(self, func_or_funcs, *args, **kwargs): + def aggregate(self, func_or_funcs=None, *args, **kwargs): _level = kwargs.pop('_level', None) + + relabeling = func_or_funcs is None + columns = None + no_arg_message = ("Must provide 'func_or_funcs' or named " + "aggregation **kwargs.") + if relabeling: + columns = list(kwargs) + if not PY36: + # sort for 3.5 and earlier + columns = list(sorted(columns)) + + func_or_funcs = [kwargs[col] for col in columns] + kwargs = {} + if not columns: + raise TypeError(no_arg_message) + if isinstance(func_or_funcs, str): return getattr(self, func_or_funcs)(*args, **kwargs) @@ -759,6 +786,8 @@ def aggregate(self, func_or_funcs, *args, **kwargs): # but not the class list / tuple itself. 
ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) + if relabeling: + ret.columns = columns else: cyfunc = self._is_cython_func(func_or_funcs) if cyfunc and not args and not kwargs: @@ -793,11 +822,14 @@ def _aggregate_multiple_funcs(self, arg, _level): # have not shown a higher level one # GH 15931 if isinstance(self._selected_obj, Series) and _level <= 1: - warnings.warn( - ("using a dict on a Series for aggregation\n" - "is deprecated and will be removed in a future " - "version"), - FutureWarning, stacklevel=3) + msg = dedent("""\ + using a dict on a Series for aggregation + is deprecated and will be removed in a future version. Use \ + named aggregation instead. + + >>> grouper.agg(name_1=func_1, name_2=func_2) + """) + warnings.warn(msg, FutureWarning, stacklevel=3) columns = list(arg.keys()) arg = arg.items() @@ -1562,7 +1594,7 @@ def groupby_series(obj, col=None): def _is_multi_agg_with_relabel(**kwargs): """ - Check whether the kwargs pass to .agg look like multi-agg with relabling. + Check whether kwargs passed to .agg look like multi-agg with relabeling. 
Parameters ---------- diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 9e714a1086037..801b99fed5ce6 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -329,8 +329,41 @@ def test_uint64_type_handling(dtype, how): tm.assert_frame_equal(result, expected, check_exact=True) -class TestNamedAggregation: +class TestNamedAggregationSeries: + + def test_series_named_agg(self): + df = pd.Series([1, 2, 3, 4]) + gr = df.groupby([0, 0, 1, 1]) + result = gr.agg(a='sum', b='min') + expected = pd.DataFrame({'a': [3, 7], 'b': [1, 3]}, + columns=['a', 'b'], index=[0, 1]) + tm.assert_frame_equal(result, expected) + + result = gr.agg(b='min', a='sum') + # sort for 35 and earlier + if compat.PY36: + expected = expected[['b', 'a']] + tm.assert_frame_equal(result, expected) + + def test_no_args_raises(self): + gr = pd.Series([1, 2]).groupby([0, 1]) + with pytest.raises(TypeError, match='Must provide'): + gr.agg() + + # but we do allow this + result = gr.agg([]) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + def test_series_named_agg_duplicates_raises(self): + # This is a limitation of the named agg implementation reusing + # aggregate_multiple_funcs. It could maybe be lifted in the future. 
+ gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + with pytest.raises(SpecificationError): + gr.agg(a='sum', b='sum') + +class TestNamedAggregationDataFrame: def test_agg_relabel(self): df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], "A": [0, 1, 2, 3], diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 8168cf06ffdb1..a061eaa1a2c6f 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -225,6 +225,7 @@ def test_agg_dict_renaming_deprecation(): with tm.assert_produces_warning(FutureWarning) as w: df.groupby('A').B.agg({'foo': 'count'}) assert "using a dict on a Series for aggregation" in str(w[0].message) + assert "named aggregation instead." in str(w[0].message) def test_agg_compat():