pandas-dev · jreback · May 30, 2019 · May 13, 2019 · May 15, 2019 · May 15, 2019
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -601,6 +601,30 @@ must be either implemented on GroupBy or available via :ref:`dispatching
    grouped.agg({'D': 'std', 'C': 'mean'})
    grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')]))
 
+.. versionadded:: 0.25.0
+
+To support column-specific aggregation with control over the output column names, pandas
+accepts the special syntax where
+
+1. The keywords are the *output* column names
+2. The first element of each tuple is the column to select
+3. The second element of each tuple is the aggregation function to apply to that column.
+
+.. ipython:: python
+
+   grouped.agg(d_std=('D', 'std'), c_mean=('C', 'mean'))
+
+If your desired output column names are not valid Python identifiers, construct a
+dictionary and unpack it as keyword arguments
+
+.. ipython:: python
+
+   grouped.agg(**{'d_std': ('D', 'std'), 'mean of C': ('C', 'mean')})
+
+Additional keyword arguments are not passed through to the aggregation functions. Only pairs
+of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions
+require additional arguments, partially apply them with :meth:`functools.partial`.
+
 .. _groupby.aggregate.cython:
 
 Cython-optimized aggregation functions

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -19,6 +19,29 @@ These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog
 including other versions of pandas.
 
 
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_0250.enhancements.agg_relabel:
+
+Groupby Aggregation with Relabeling
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Pandas has added special syntax for naming the output columns when applying multiple aggregation functions to specific
+columns.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
+                      'height': [9.1, 6.0, 9.5, 34.0],
+                      'weight': [7.9, 7.5, 9.9, 198.0]})
+   grouper = df.groupby("kind")
+   grouper.agg(max_height=('height', 'max'), average_weight=('weight', 'mean'))
+
+Pass the desired column names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs``
+should be tuples where the first element is the column selection, and the second element is the
+aggregation function to apply.
+
 .. _whatsnew_0250.enhancements.other:
 
 Other Enhancements

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -144,8 +144,30 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
         return new_items, new_blocks
 
     def aggregate(self, func, *args, **kwargs):
-
         _level = kwargs.pop('_level', None)
+
+        relabeling = func is None and _is_multi_agg_with_relabel(**kwargs)
+        if relabeling:
+            # Normalize the aggregation functions as Dict[column, List[func]],
+            # process normally, then fixup the names.
+            # TODO(Py35): When we drop python 3.5, change this to
+            # defaultdict(list)
+            func = OrderedDict()
+            order = []
+            columns, pairs = list(zip(*kwargs.items()))
+
+            for i, (name, (column, aggfunc)) in enumerate(zip(columns, pairs)):
+                if column in func:
+                    func[column].append(aggfunc)
+                else:
+                    func[column] = [aggfunc]
+                order.append((column, _get_agg_name(aggfunc)))
+            kwargs = {}
+        elif func is None:
+            # nicer error message
+            raise TypeError("Must provide 'func' or tuples of "
+                            "'(column, aggfunc).")
+
         result, how = self._aggregate(func, _level=_level, *args, **kwargs)
         if how is None:
             return result
@@ -179,6 +201,10 @@ def aggregate(self, func, *args, **kwargs):
             self._insert_inaxis_grouper_inplace(result)
             result.index = np.arange(len(result))
 
+        if relabeling:
+            result = result[order]
+            result.columns = columns
+
         return result._convert(datetime=True)
 
     agg = aggregate
@@ -791,11 +817,8 @@ def _aggregate_multiple_funcs(self, arg, _level):
             # list of functions / function names
             columns = []
             for f in arg:
-                if isinstance(f, str):
-                    columns.append(f)
-                else:
-                    # protect against callables without names
-                    columns.append(com.get_callable_name(f))
+                columns.append(_get_agg_name(f))
+
             arg = zip(columns, arg)
 
         results = OrderedDict()
@@ -1292,6 +1315,16 @@ class DataFrameGroupBy(NDFrameGroupBy):
     A
     1   1   2  0.590716
     2   3   4  0.704907
+
+    To control the output names with different aggregations
+    per column, pass tuples of ``(column, aggfunc)`` as keyword
+    arguments, keyed by the desired output column name
+
+    >>> df.groupby("A").agg(b_min=("B", "min"), c_sum=("C", "sum"))
+           b_min     c_sum
+    A
+    1      1  0.825627
+    2      3  2.218618
     """)
 
     @Substitution(see_also=_agg_see_also_doc,
@@ -1300,7 +1333,7 @@ class DataFrameGroupBy(NDFrameGroupBy):
                   klass='DataFrame',
                   axis='')
     @Appender(_shared_docs['aggregate'])
-    def aggregate(self, arg, *args, **kwargs):
+    def aggregate(self, arg=None, *args, **kwargs):
         return super().aggregate(arg, *args, **kwargs)
 
     agg = aggregate
@@ -1573,3 +1606,48 @@ def groupby_series(obj, col=None):
         return results
 
     boxplot = boxplot_frame_groupby
+
+
+def _is_multi_agg_with_relabel(**kwargs):
+    """
+    Check whether kwargs passed to .agg look like multi-agg with relabeling.
+
+    Parameters
+    ----------
+    **kwargs : dict
+
+    Returns
+    -------
+    bool
+        True only for non-empty kwargs whose values are all 2-tuples.
+
+    Examples
+    --------
+    >>> _is_multi_agg_with_relabel(a='max')
+    False
+    >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), a_min=('a', 'min'))
+    True
+    >>> _is_multi_agg_with_relabel()
+    False
+    """
+    return bool(kwargs) and all(
+        isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()
+    )
+
+
+def _get_agg_name(arg):
+    """
+    Return the label used for the aggregation ``arg``.
+
+    Parameters
+    ----------
+    arg : str or callable
+
+    Returns
+    -------
+    str, or whatever ``com.get_callable_name(arg)`` returns
+    """
+    if not isinstance(arg, str):
+        # protect against callables without names
+        return com.get_callable_name(arg)
+    return arg
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -313,3 +313,80 @@ def test_order_aggregate_multiple_funcs():
     expected = pd.Index(['sum', 'max', 'mean', 'ohlc', 'min'])
 
     tm.assert_index_equal(result, expected)
+
+
+def test_agg_relabel():
+    # keyword names become the output column labels
+    frame = pd.DataFrame({"group": ['a', 'a', 'b', 'b'],
+                          "A": [0, 1, 2, 3],
+                          "B": [5, 6, 7, 8]})
+    group_index = pd.Index(['a', 'b'], name='group')
+
+    result = frame.groupby("group").agg(a_max=("A", "max"),
+                                        b_max=("B", "max"))
+    expected = pd.DataFrame({"a_max": [1, 3], "b_max": [6, 8]},
+                            index=group_index)
+    tm.assert_frame_equal(result, expected)
+
+    # output columns keep the keyword order; strings and callables
+    # may be mixed as aggregation functions
+    names = ['b_min', 'a_min', 'a_max', 'b_max']
+    result = frame.groupby('group').agg(
+        b_min=("B", "min"),
+        a_min=("A", min),
+        a_max=("A", "max"),
+        b_max=("B", "max"),
+    )
+    expected = pd.DataFrame({"b_min": [5, 7],
+                             "a_min": [0, 2],
+                             "a_max": [1, 3],
+                             "b_max": [6, 8]},
+                            index=group_index, columns=names)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_relabel_non_identifier():
+    # output names that are not valid identifiers use ** unpacking
+    spec = {'my col': ('A', 'max')}
+    frame = pd.DataFrame({"group": ['a', 'a', 'b', 'b'],
+                          "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]})
+    result = frame.groupby("group").agg(**spec)
+    group_index = pd.Index(['a', 'b'], name='group')
+    expected = pd.DataFrame({'my col': [1, 3]}, index=group_index)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_duplicate_raises():
+    # TODO: multiple lambdas currently raise as well. We could *maybe*
+    # update com.get_callable_name to append `_i` to each lambda.
+    frame = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
+    with pytest.raises(SpecificationError, match="Function names"):
+        frame.groupby("A").agg(**{'a': ("A", "min"), 'b': ("A", "min")})
+
+
+def test_agg_relabel_with_level():
+    # relabeling also works when grouping by an index level
+    midx = pd.MultiIndex.from_product([['A', 'B'], ['a', 'b']])
+    frame = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, index=midx)
+    result = frame.groupby(level=0).agg(
+        aa=('A', 'max'),
+        bb=('A', 'min'),
+        cc=('B', 'mean'),
+    )
+    expected = pd.DataFrame({'aa': [0, 1], 'bb': [0, 1], 'cc': [1.5, 3.5]},
+                            index=['A', 'B'])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_relabel_other_raises():
+    # without ``func``, anything but (column, aggfunc) pairs is rejected
+    gb = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}).groupby("A")
+    msg = 'Must provide'
+    bad_specs = [
+        {'foo': 1},  # value is not a tuple
+        {},  # nothing passed at all
+        {'a': ('B', 'max'), 'b': (1, 2, 3)},  # one tuple has 3 items
+    ]
+    for spec in bad_specs:
+        with pytest.raises(TypeError, match=msg):
+            gb.agg(**spec)