Skip to content

Commit 5574a9f

Browse files
TomAugspurgerjreback
authored andcommitted
ENH: Named aggregation in SeriesGroupBy.agg (#26580)
1 parent 5cab4d6 commit 5574a9f

File tree

5 files changed

+103
-13
lines changed

5 files changed

+103
-13
lines changed

doc/source/user_guide/groupby.rst

+12-2
Original file line numberDiff line numberDiff line change
@@ -595,7 +595,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation",
595595
animals.groupby("kind").agg(
596596
min_height=pd.NamedAgg(column='height', aggfunc='min'),
597597
max_height=pd.NamedAgg(column='height', aggfunc='max'),
598-
average_weight=pd.NamedAgg(column='height', aggfunc=np.mean),
598+
average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean),
599599
)
600600
601601
@@ -606,7 +606,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation",
606606
animals.groupby("kind").agg(
607607
min_height=('height', 'min'),
608608
max_height=('height', 'max'),
609-
average_weight=('height', np.mean),
609+
average_weight=('weight', np.mean),
610610
)
611611
612612
@@ -630,6 +630,16 @@ requires additional arguments, partially apply them with :meth:`functools.partia
630630
consistent. To ensure consistent ordering, the keys (and so output columns)
631631
will always be sorted for Python 3.5.
632632

633+
Named aggregation is also valid for Series groupby aggregations. In this case there's
634+
no column selection, so the values are just the functions.
635+
636+
.. ipython:: python
637+
638+
animals.groupby("kind").height.agg(
639+
min_height='min',
640+
max_height='max',
641+
)
642+
633643
Applying different functions to DataFrame columns
634644
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
635645

doc/source/whatsnew/v0.25.0.rst

+17-3
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Groupby Aggregation with Relabeling
2828
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2929

3030
Pandas has added special groupby behavior, known as "named aggregation", for naming the
31-
output columns when applying multiple aggregation functions to specific columns (:issue:`18366`).
31+
output columns when applying multiple aggregation functions to specific columns (:issue:`18366`, :issue:`26512`).
3232

3333
.. ipython:: python
3434
@@ -39,7 +39,7 @@ output columns when applying multiple aggregation functions to specific columns
3939
animals.groupby("kind").agg(
4040
min_height=pd.NamedAgg(column='height', aggfunc='min'),
4141
max_height=pd.NamedAgg(column='height', aggfunc='max'),
42-
average_weight=pd.NamedAgg(column='height', aggfunc=np.mean),
42+
average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean),
4343
)
4444
4545
Pass the desired column names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs``
@@ -52,12 +52,26 @@ what the arguments to the function are, but plain tuples are accepted as well.
5252
animals.groupby("kind").agg(
5353
min_height=('height', 'min'),
5454
max_height=('height', 'max'),
55-
average_weight=('height', np.mean),
55+
average_weight=('weight', np.mean),
5656
)
5757
5858
Named aggregation is the recommended replacement for the deprecated "dict-of-dicts"
5959
approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`).
6060

61+
A similar approach is now available for Series groupby objects as well. Because there's no need for
62+
column selection, the values can just be the functions to apply:
63+
64+
.. ipython:: python
65+
66+
animals.groupby("kind").height.agg(
67+
min_height="min",
68+
max_height="max",
69+
)
70+
71+
72+
This type of aggregation is the recommended alternative to the deprecated behavior when passing
73+
a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`).
74+
6175
See :ref:`groupby.aggregate.named` for more.
6276

6377
.. _whatsnew_0250.enhancements.other:

pandas/core/groupby/generic.py

+39-7
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,17 @@ def _selection_name(self):
735735
min max
736736
1 1 2
737737
2 3 4
738+
739+
The output column names can be controlled by passing
740+
the desired column names and aggregations as keyword arguments.
741+
742+
>>> s.groupby([1, 1, 2, 2]).agg(
743+
... minimum='min',
744+
... maximum='max',
745+
... )
746+
minimum maximum
747+
1 1 2
748+
2 3 4
738749
""")
739750

740751
@Appender(_apply_docs['template']
@@ -749,8 +760,24 @@ def apply(self, func, *args, **kwargs):
749760
klass='Series',
750761
axis='')
751762
@Appender(_shared_docs['aggregate'])
752-
def aggregate(self, func_or_funcs, *args, **kwargs):
763+
def aggregate(self, func_or_funcs=None, *args, **kwargs):
753764
_level = kwargs.pop('_level', None)
765+
766+
relabeling = func_or_funcs is None
767+
columns = None
768+
no_arg_message = ("Must provide 'func_or_funcs' or named "
769+
"aggregation **kwargs.")
770+
if relabeling:
771+
columns = list(kwargs)
772+
if not PY36:
773+
# sort for 3.5 and earlier
774+
columns = list(sorted(columns))
775+
776+
func_or_funcs = [kwargs[col] for col in columns]
777+
kwargs = {}
778+
if not columns:
779+
raise TypeError(no_arg_message)
780+
754781
if isinstance(func_or_funcs, str):
755782
return getattr(self, func_or_funcs)(*args, **kwargs)
756783

@@ -759,6 +786,8 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
759786
# but not the class list / tuple itself.
760787
ret = self._aggregate_multiple_funcs(func_or_funcs,
761788
(_level or 0) + 1)
789+
if relabeling:
790+
ret.columns = columns
762791
else:
763792
cyfunc = self._is_cython_func(func_or_funcs)
764793
if cyfunc and not args and not kwargs:
@@ -793,11 +822,14 @@ def _aggregate_multiple_funcs(self, arg, _level):
793822
# have not shown a higher level one
794823
# GH 15931
795824
if isinstance(self._selected_obj, Series) and _level <= 1:
796-
warnings.warn(
797-
("using a dict on a Series for aggregation\n"
798-
"is deprecated and will be removed in a future "
799-
"version"),
800-
FutureWarning, stacklevel=3)
825+
msg = dedent("""\
826+
using a dict on a Series for aggregation
827+
is deprecated and will be removed in a future version. Use \
828+
named aggregation instead.
829+
830+
>>> grouper.agg(name_1=func_1, name_2=func_2)
831+
""")
832+
warnings.warn(msg, FutureWarning, stacklevel=3)
801833

802834
columns = list(arg.keys())
803835
arg = arg.items()
@@ -1562,7 +1594,7 @@ def groupby_series(obj, col=None):
15621594

15631595
def _is_multi_agg_with_relabel(**kwargs):
15641596
"""
1565-
Check whether the kwargs pass to .agg look like multi-agg with relabling.
1597+
Check whether kwargs passed to .agg look like multi-agg with relabeling.
15661598
15671599
Parameters
15681600
----------

pandas/tests/groupby/aggregate/test_aggregate.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,41 @@ def test_uint64_type_handling(dtype, how):
329329
tm.assert_frame_equal(result, expected, check_exact=True)
330330

331331

332-
class TestNamedAggregation:
332+
class TestNamedAggregationSeries:
333+
334+
def test_series_named_agg(self):
335+
df = pd.Series([1, 2, 3, 4])
336+
gr = df.groupby([0, 0, 1, 1])
337+
result = gr.agg(a='sum', b='min')
338+
expected = pd.DataFrame({'a': [3, 7], 'b': [1, 3]},
339+
columns=['a', 'b'], index=[0, 1])
340+
tm.assert_frame_equal(result, expected)
341+
342+
result = gr.agg(b='min', a='sum')
343+
# keyword order is preserved on Python 3.6+; on 3.5 and earlier the keys are sorted
344+
if compat.PY36:
345+
expected = expected[['b', 'a']]
346+
tm.assert_frame_equal(result, expected)
347+
348+
def test_no_args_raises(self):
349+
gr = pd.Series([1, 2]).groupby([0, 1])
350+
with pytest.raises(TypeError, match='Must provide'):
351+
gr.agg()
352+
353+
# but we do allow this
354+
result = gr.agg([])
355+
expected = pd.DataFrame()
356+
tm.assert_frame_equal(result, expected)
357+
358+
def test_series_named_agg_duplicates_raises(self):
359+
# This is a limitation of the named agg implementation reusing
360+
# aggregate_multiple_funcs. It could maybe be lifted in the future.
361+
gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
362+
with pytest.raises(SpecificationError):
363+
gr.agg(a='sum', b='sum')
364+
333365

366+
class TestNamedAggregationDataFrame:
334367
def test_agg_relabel(self):
335368
df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'],
336369
"A": [0, 1, 2, 3],

pandas/tests/groupby/aggregate/test_other.py

+1
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ def test_agg_dict_renaming_deprecation():
225225
with tm.assert_produces_warning(FutureWarning) as w:
226226
df.groupby('A').B.agg({'foo': 'count'})
227227
assert "using a dict on a Series for aggregation" in str(w[0].message)
228+
assert "named aggregation instead." in str(w[0].message)
228229

229230

230231
def test_agg_compat():

0 commit comments

Comments
 (0)