ENH: Support nested renaming / selection (#26399)

TomAugspurger · jreback · commit 072408ea8d65 · 2019-05-29T21:29:40.000-04:00
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -568,6 +568,67 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
                             'mean': 'bar',
                             'std': 'baz'}))
 
+.. _groupby.aggregate.named:
+
+Named Aggregation
+~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.25.0
+
+To support column-specific aggregation *with control over the output column names*, pandas
+accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", where
+
+- The keywords are the *output* column names
+- The values are tuples whose first element is the column to select
+  and the second element is the aggregation to apply to that column. Pandas
+  provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']``
+  to make it clearer what the arguments are. As usual, the aggregation can
+  be a callable or a string alias.
+
+.. ipython:: python
+
+   animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
+                           'height': [9.1, 6.0, 9.5, 34.0],
+                           'weight': [7.9, 7.5, 9.9, 198.0]})
+   animals
+
+   animals.groupby("kind").agg(
+       min_height=pd.NamedAgg(column='height', aggfunc='min'),
+       max_height=pd.NamedAgg(column='height', aggfunc='max'),
+       average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean),
+   )
+
+
+``pandas.NamedAgg`` is just a ``namedtuple``. Plain tuples are allowed as well.
+
+.. ipython:: python
+
+   animals.groupby("kind").agg(
+       min_height=('height', 'min'),
+       max_height=('height', 'max'),
+       average_weight=('weight', np.mean),
+   )
+
+
+If your desired output column names are not valid Python identifiers (e.g. they contain a
+space), construct a dictionary and unpack it as keyword arguments:
+
+.. ipython:: python
+
+   animals.groupby("kind").agg(**{
+       'total weight': pd.NamedAgg(column='weight', aggfunc=sum),
+   })
+
+Additional keyword arguments are not passed through to the aggregation functions. Only pairs
+of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation function
+requires additional arguments, partially apply them with :func:`functools.partial`.
+
+.. note::
+
+   For Python 3.5 and earlier, the order of ``**kwargs`` in a function call was not
+   preserved. This means that the output column ordering would not be
+   consistent. To ensure consistent ordering, the keys (and so output columns)
+   will always be sorted for Python 3.5.
 
 Applying different functions to DataFrame columns
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -588,19 +649,6 @@ must be either implemented on GroupBy or available via :ref:`dispatching
 
    grouped.agg({'C': 'sum', 'D': 'std'})
 
-.. note::
-
-    If you pass a dict to ``aggregate``, the ordering of the output columns is
-    non-deterministic. If you want to be sure the output columns will be in a specific
-    order, you can use an ``OrderedDict``.  Compare the output of the following two commands:
-
-.. ipython:: python
-
-   from collections import OrderedDict
-
-   grouped.agg({'D': 'std', 'C': 'mean'})
-   grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')]))
-
 .. _groupby.aggregate.cython:
 
 Cython-optimized aggregation functions
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -19,6 +19,47 @@ These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog
 including other versions of pandas.
 
 
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_0250.enhancements.agg_relabel:
+
+Groupby Aggregation with Relabeling
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Pandas has added special groupby behavior, known as "named aggregation", for naming the
+output columns when applying multiple aggregation functions to specific columns (:issue:`18366`).
+
+.. ipython:: python
+
+   animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
+                           'height': [9.1, 6.0, 9.5, 34.0],
+                           'weight': [7.9, 7.5, 9.9, 198.0]})
+   animals
+   animals.groupby("kind").agg(
+       min_height=pd.NamedAgg(column='height', aggfunc='min'),
+       max_height=pd.NamedAgg(column='height', aggfunc='max'),
+       average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean),
+   )
+
+Pass the desired column names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs``
+should be tuples where the first element is the column selection, and the second element is the
+aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer
+what the arguments to the function are, but plain tuples are accepted as well.
+
+.. ipython:: python
+
+   animals.groupby("kind").agg(
+       min_height=('height', 'min'),
+       max_height=('height', 'max'),
+       average_weight=('weight', np.mean),
+   )
+
+Named aggregation is the recommended replacement for the deprecated "dict-of-dicts"
+approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`).
+
+See :ref:`groupby.aggregate.named` for more.
+
 .. _whatsnew_0250.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -65,7 +65,7 @@
     to_numeric, to_datetime, to_timedelta,
 
     # misc
-    np, Grouper, factorize, unique, value_counts,
+    np, Grouper, factorize, unique, value_counts, NamedAgg,
     array, Categorical, set_eng_float_format, Series, DataFrame,
     Panel)
 
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -21,7 +21,7 @@
     DatetimeTZDtype,
 )
 from pandas.core.arrays import Categorical, array
-from pandas.core.groupby import Grouper
+from pandas.core.groupby import Grouper, NamedAgg
 from pandas.io.formats.format import set_eng_float_format
 from pandas.core.index import (Index, CategoricalIndex, Int64Index,
                                UInt64Index, RangeIndex, Float64Index,
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -340,11 +340,15 @@ def _aggregate(self, arg, *args, **kwargs):
             def nested_renaming_depr(level=4):
                 # deprecation of nested renaming
                 # GH 15931
-                warnings.warn(
-                    ("using a dict with renaming "
-                     "is deprecated and will be removed in a future "
-                     "version"),
-                    FutureWarning, stacklevel=level)
+                msg = textwrap.dedent("""\
+                using a dict with renaming is deprecated and will be removed
+                in a future version.
+
+                For column-specific groupby renaming, use named aggregation
+
+                    >>> df.groupby(...).agg(name=('column', aggfunc))
+                """)
+                warnings.warn(msg, FutureWarning, stacklevel=level)
 
             # if we have a dict of any non-scalars
             # eg. {'A' : ['mean']}, normalize all to
diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py
@@ -1,4 +1,4 @@
-from pandas.core.groupby.groupby import GroupBy  # noqa: F401
 from pandas.core.groupby.generic import (  # noqa: F401
-    SeriesGroupBy, DataFrameGroupBy)
+    DataFrameGroupBy, NamedAgg, SeriesGroupBy)
+from pandas.core.groupby.groupby import GroupBy  # noqa: F401
 from pandas.core.groupby.grouper import Grouper  # noqa: F401
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -6,15 +6,18 @@
 which here returns a DataFrameGroupBy object.
 """
 
-from collections import OrderedDict, abc
+from collections import OrderedDict, abc, namedtuple
 import copy
 from functools import partial
 from textwrap import dedent
+import typing
+from typing import Any, Callable, List, Union
 import warnings
 
 import numpy as np
 
 from pandas._libs import Timestamp, lib
+from pandas.compat import PY36
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import Appender, Substitution
 
@@ -41,6 +44,10 @@
 
 from pandas.plotting._core import boxplot_frame_groupby
 
+NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
+# TODO(typing) the return value on this callable should be any *scalar*.
+AggScalar = Union[str, Callable[..., Any]]
+
 
 class NDFrameGroupBy(GroupBy):
 
@@ -144,8 +151,18 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
         return new_items, new_blocks
 
     def aggregate(self, func, *args, **kwargs):
-
         _level = kwargs.pop('_level', None)
+
+        relabeling = func is None and _is_multi_agg_with_relabel(**kwargs)
+        if relabeling:
+            func, columns, order = _normalize_keyword_aggregation(kwargs)
+
+            kwargs = {}
+        elif func is None:
+            # nicer error message
+            raise TypeError("Must provide 'func' or tuples of "
+                            "'(column, aggfunc).")
+
         result, how = self._aggregate(func, _level=_level, *args, **kwargs)
         if how is None:
             return result
@@ -179,6 +196,10 @@ def aggregate(self, func, *args, **kwargs):
             self._insert_inaxis_grouper_inplace(result)
             result.index = np.arange(len(result))
 
+        if relabeling:
+            result = result[order]
+            result.columns = columns
+
         return result._convert(datetime=True)
 
     agg = aggregate
@@ -791,11 +812,8 @@ def _aggregate_multiple_funcs(self, arg, _level):
             # list of functions / function names
             columns = []
             for f in arg:
-                if isinstance(f, str):
-                    columns.append(f)
-                else:
-                    # protect against callables without names
-                    columns.append(com.get_callable_name(f))
+                columns.append(com.get_callable_name(f) or f)
+
             arg = zip(columns, arg)
 
         results = OrderedDict()
@@ -1296,6 +1314,26 @@ class DataFrameGroupBy(NDFrameGroupBy):
     A
     1   1   2  0.590716
     2   3   4  0.704907
+
+    To control the output names with different aggregations per column,
+    pandas supports "named aggregation"
+
+    >>> df.groupby("A").agg(
+    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
+    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
+       b_min     c_sum
+    A
+    1      1 -1.956929
+    2      3 -0.322183
+
+    - The keywords are the *output* column names
+    - The values are tuples whose first element is the column to select
+      and the second element is the aggregation to apply to that column.
+      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
+      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
+      As usual, the aggregation can be a callable or a string alias.
+
+    See :ref:`groupby.aggregate.named` for more.
     """)
 
     @Substitution(see_also=_agg_see_also_doc,
@@ -1304,7 +1342,7 @@ class DataFrameGroupBy(NDFrameGroupBy):
                   klass='DataFrame',
                   axis='')
     @Appender(_shared_docs['aggregate'])
-    def aggregate(self, arg, *args, **kwargs):
+    def aggregate(self, arg=None, *args, **kwargs):
         return super().aggregate(arg, *args, **kwargs)
 
     agg = aggregate
@@ -1577,3 +1615,77 @@ def groupby_series(obj, col=None):
         return results
 
     boxplot = boxplot_frame_groupby
+
+
+def _is_multi_agg_with_relabel(**kwargs):
+    """
+    Check whether kwargs passed to .agg look like multi-agg with relabeling.
+
+    Parameters
+    ----------
+    **kwargs : dict
+        The keyword arguments received by ``.agg``.
+
+    Returns
+    -------
+    bool
+        True only if kwargs is non-empty and every value is a 2-tuple.
+
+    Examples
+    --------
+    >>> _is_multi_agg_with_relabel(a='max')
+    False
+    >>> _is_multi_agg_with_relabel(a_max=('a', 'max'),
+    ...                            a_min=('a', 'min'))
+    True
+    >>> _is_multi_agg_with_relabel()
+    False
+    """
+    return bool(kwargs) and all(isinstance(v, tuple) and len(v) == 2
+                                for v in kwargs.values())
+
+
+def _normalize_keyword_aggregation(kwargs):
+    """
+    Convert "named aggregation" kwargs into a classic per-column agg spec.
+
+    The ``{output_name: (column, aggfunc)}`` mapping is regrouped into an
+    ``OrderedDict[column, List[aggfunc]]`` that the existing aggregation
+    machinery understands; the caller renames the result afterwards.
+
+    Parameters
+    ----------
+    kwargs : dict
+        Non-empty mapping of output column name to ``(column, aggfunc)``.
+
+    Returns
+    -------
+    aggspec : OrderedDict
+        Maps each input column to the list of aggfuncs requested for it.
+    columns : Tuple[str, ...]
+        The user-provided output names, in processing order.
+    order : List[Tuple[str, str]]
+        ``(input column, aggfunc name)`` pairs, aligned with ``columns``.
+
+    Examples
+    --------
+    >>> _normalize_keyword_aggregation({'output': ('input', 'sum')})
+    (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
+    """
+    if not PY36:
+        # kwargs order is arbitrary before Python 3.6; sort for determinism
+        kwargs = OrderedDict(sorted(kwargs.items()))
+
+    # TODO(Py35): When we drop python 3.5, change this to
+    # defaultdict(list)
+    aggspec = OrderedDict()  # type: typing.OrderedDict[str, List[AggScalar]]
+    order = []
+    # unzip into the output names and their (column, aggfunc) selections
+    columns, pairs = list(zip(*kwargs.items()))
+
+    for column, aggfunc in pairs:
+        aggspec.setdefault(column, []).append(aggfunc)
+        # label for this result: the callable's name, else aggfunc itself
+        label = com.get_callable_name(aggfunc) or aggfunc
+        order.append((column, label))
+    return aggspec, columns, order
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
@@ -47,6 +47,7 @@ class TestPDApi(Base):
                'DatetimeTZDtype',
                'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype',
                'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype',
+               'NamedAgg',
                ]
 
     # these are already deprecated; awaiting removal
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@`
`21`	`21`	`DatetimeTZDtype,`
`22`	`22`	`)`
`23`	`23`	`from pandas.core.arrays import Categorical, array`
`24`		`-from pandas.core.groupby import Grouper`
	`24`	`+from pandas.core.groupby import Grouper, NamedAgg`
`25`	`25`	`from pandas.io.formats.format import set_eng_float_format`
`26`	`26`	`from pandas.core.index import (Index, CategoricalIndex, Int64Index,`
`27`	`27`	`UInt64Index, RangeIndex, Float64Index,`
Original file line number	Diff line number	Diff line change
`@@ -47,6 +47,7 @@ class TestPDApi(Base):`
`47`	`47`	`'DatetimeTZDtype',`
`48`	`48`	`'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype',`
`49`	`49`	`'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype',`
	`50`	`+ 'NamedAgg',`
`50`	`51`	`]`
`51`	`52`
`52`	`53`	`# these are already deprecated; awaiting removal`