diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 4f116a42253e5..2014dbd9865f3 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -568,6 +568,67 @@ For a grouped ``DataFrame``, you can rename in a similar manner: 'mean': 'bar', 'std': 'baz'})) +.. _groupby.aggregate.named: + +Named Aggregation +~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.25.0 + +To support column-specific aggregation *with control over the output column names*, pandas +accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", where + +- The keywords are the *output* column names +- The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. Pandas + provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` + to make it clearer what the arguments are. As usual, the aggregation can + be a callable or a string alias. + +.. ipython:: python + + animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals + + animals.groupby("kind").agg( + min_height=pd.NamedAgg(column='height', aggfunc='min'), + max_height=pd.NamedAgg(column='height', aggfunc='max'), + average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), + ) + + +``pandas.NamedAgg`` is just a ``namedtuple``. Plain tuples are allowed as well. + +.. ipython:: python + + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), + average_weight=('weight', np.mean), + ) + + +If your desired output column names are not valid Python identifiers, construct a dictionary +and unpack the keyword arguments + +.. ipython:: python + + animals.groupby("kind").agg(**{ + 'total weight': pd.NamedAgg(column='weight', aggfunc=sum), + }) + +Additional keyword arguments are not passed through to the aggregation functions. 
Only pairs +of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation function +requires additional arguments, partially apply them with :func:`functools.partial`. + +.. note:: + + For Python 3.5 and earlier, the order of ``**kwargs`` in a function was not + preserved. This means that the output column ordering would not be + consistent. To ensure consistent ordering, the keys (and so output columns) + will always be sorted for Python 3.5. Applying different functions to DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -588,19 +649,6 @@ must be either implemented on GroupBy or available via :ref:`dispatching grouped.agg({'C': 'sum', 'D': 'std'}) -.. note:: - - If you pass a dict to ``aggregate``, the ordering of the output columns is - non-deterministic. If you want to be sure the output columns will be in a specific - order, you can use an ``OrderedDict``. Compare the output of the following two commands: - -.. ipython:: python - - from collections import OrderedDict - - grouped.agg({'D': 'std', 'C': 'mean'}) - grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) - .. _groupby.aggregate.cython: Cython-optimized aggregation functions diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2c66d3e4db321..96837916f815b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -19,6 +19,47 @@ These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog including other versions of pandas. +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_0250.enhancements.agg_relabel: + +Groupby Aggregation with Relabeling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has added special groupby behavior, known as "named aggregation", for naming the +output columns when applying multiple aggregation functions to specific columns (:issue:`18366`). + +.. 
ipython:: python + + animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals + animals.groupby("kind").agg( + min_height=pd.NamedAgg(column='height', aggfunc='min'), + max_height=pd.NamedAgg(column='height', aggfunc='max'), + average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), + ) + +Pass the desired column names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` +should be tuples where the first element is the column selection, and the second element is the +aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer +what the arguments to the function are, but plain tuples are accepted as well. + +.. ipython:: python + + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), + average_weight=('weight', np.mean), + ) + +Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" +approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). + +See :ref:`groupby.aggregate.named` for more. + +.. 
_whatsnew_0250.enhancements.other: Other Enhancements diff --git a/pandas/__init__.py b/pandas/__init__.py index 6af6f3093c120..4c494b4a62e39 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,7 @@ to_numeric, to_datetime, to_timedelta, # misc - np, Grouper, factorize, unique, value_counts, + np, Grouper, factorize, unique, value_counts, NamedAgg, array, Categorical, set_eng_float_format, Series, DataFrame, Panel) diff --git a/pandas/core/api.py b/pandas/core/api.py index b7398e433f28f..0106feabcce74 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -21,7 +21,7 @@ DatetimeTZDtype, ) from pandas.core.arrays import Categorical, array -from pandas.core.groupby import Grouper +from pandas.core.groupby import Grouper, NamedAgg from pandas.io.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, diff --git a/pandas/core/base.py b/pandas/core/base.py index 3f59871fb5b38..e4274e48d3227 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -340,11 +340,15 @@ def _aggregate(self, arg, *args, **kwargs): def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 - warnings.warn( - ("using a dict with renaming " - "is deprecated and will be removed in a future " - "version"), - FutureWarning, stacklevel=level) + msg = textwrap.dedent("""\ + using a dict with renaming is deprecated and will be removed + in a future version. + + For column-specific groupby renaming, use named aggregation + + >>> df.groupby(...).agg(name=('column', aggfunc)) + """) + warnings.warn(msg, FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. 
{'A' : ['mean']}, normalize all to diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index ac35f3825e5e8..fe50bd91a4f56 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,4 @@ -from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.generic import ( # noqa: F401 - SeriesGroupBy, DataFrameGroupBy) + DataFrameGroupBy, NamedAgg, SeriesGroupBy) +from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.grouper import Grouper # noqa: F401 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2f665975f96bd..faa4d868bb65a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -6,15 +6,18 @@ which here returns a DataFrameGroupBy object. """ -from collections import OrderedDict, abc +from collections import OrderedDict, abc, namedtuple import copy from functools import partial from textwrap import dedent +import typing +from typing import Any, Callable, List, Union import warnings import numpy as np from pandas._libs import Timestamp, lib +from pandas.compat import PY36 from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution @@ -41,6 +44,10 @@ from pandas.plotting._core import boxplot_frame_groupby +NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +# TODO(typing) the return value on this callable should be any *scalar*. 
+AggScalar = Union[str, Callable[..., Any]] + class NDFrameGroupBy(GroupBy): @@ -144,8 +151,18 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, return new_items, new_blocks def aggregate(self, func, *args, **kwargs): - _level = kwargs.pop('_level', None) + + relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + if relabeling: + func, columns, order = _normalize_keyword_aggregation(kwargs) + + kwargs = {} + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of " + "'(column, aggfunc).") + result, how = self._aggregate(func, _level=_level, *args, **kwargs) if how is None: return result @@ -179,6 +196,10 @@ def aggregate(self, func, *args, **kwargs): self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) + if relabeling: + result = result[order] + result.columns = columns + return result._convert(datetime=True) agg = aggregate @@ -791,11 +812,8 @@ def _aggregate_multiple_funcs(self, arg, _level): # list of functions / function names columns = [] for f in arg: - if isinstance(f, str): - columns.append(f) - else: - # protect against callables without names - columns.append(com.get_callable_name(f)) + columns.append(com.get_callable_name(f) or f) + arg = zip(columns, arg) results = OrderedDict() @@ -1296,6 +1314,26 @@ class DataFrameGroupBy(NDFrameGroupBy): A 1 1 2 0.590716 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + b_min c_sum + A + 1 1 -1.956929 + 2 3 -0.322183 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. 
+ Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. """) @Substitution(see_also=_agg_see_also_doc, @@ -1304,7 +1342,7 @@ class DataFrameGroupBy(NDFrameGroupBy): klass='DataFrame', axis='') @Appender(_shared_docs['aggregate']) - def aggregate(self, arg, *args, **kwargs): + def aggregate(self, arg=None, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) agg = aggregate @@ -1577,3 +1615,77 @@ def groupby_series(obj, col=None): return results boxplot = boxplot_frame_groupby + + +def _is_multi_agg_with_relabel(**kwargs): + """ + Check whether the kwargs pass to .agg look like multi-agg with relabling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> _is_multi_agg_with_relabel(a='max') + False + >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> _is_multi_agg_with_relabel() + False + """ + return all( + isinstance(v, tuple) and len(v) == 2 + for v in kwargs.values() + ) and kwargs + + +def _normalize_keyword_aggregation(kwargs): + """ + Normalize user-provided "named aggregation" kwargs. + + Transforms from the new ``Dict[str, NamedAgg]`` style kwargs + to the old OrderedDict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + order : List[Tuple[str, str]] + Pairs of the input and output column names. + + Examples + -------- + >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) + (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) + """ + if not PY36: + kwargs = OrderedDict(sorted(kwargs.items())) + + # Normalize the aggregation functions as Dict[column, List[func]], + # process normally, then fixup the names. 
+ # TODO(Py35): When we drop python 3.5, change this to + # defaultdict(list) + aggspec = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]] + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for name, (column, aggfunc) in zip(columns, pairs): + if column in aggspec: + aggspec[column].append(aggfunc) + else: + aggspec[column] = [aggfunc] + order.append((column, + com.get_callable_name(aggfunc) or aggfunc)) + return aggspec, columns, order diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c92808200ebea..aa42484bf9513 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -47,6 +47,7 @@ class TestPDApi(Base): 'DatetimeTZDtype', 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', + 'NamedAgg', ] # these are already deprecated; awaiting removal diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6f54d05680698..9e714a1086037 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -2,12 +2,13 @@ test .agg behavior / note that .apply is tested generally in test_groupby.py """ from collections import OrderedDict +import functools import numpy as np import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -326,3 +327,101 @@ def test_uint64_type_handling(dtype, how): result = df.groupby('y').agg({'x': how}) result.x = result.x.astype(np.int64) tm.assert_frame_equal(result, expected, check_exact=True) + + +class TestNamedAggregation: + + def test_agg_relabel(self): + df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + result 
= df.groupby("group").agg( + a_max=("A", "max"), + b_max=("B", "max"), + ) + expected = pd.DataFrame({"a_max": [1, 3], "b_max": [6, 8]}, + index=pd.Index(['a', 'b'], name='group'), + columns=['a_max', 'b_max']) + tm.assert_frame_equal(result, expected) + + # order invariance + p98 = functools.partial(np.percentile, q=98) + result = df.groupby('group').agg( + b_min=("B", "min"), + a_min=("A", min), + a_mean=("A", np.mean), + a_max=("A", "max"), + b_max=("B", "max"), + a_98=("A", p98) + ) + expected = pd.DataFrame({"b_min": [5, 7], + "a_min": [0, 2], + "a_mean": [0.5, 2.5], + "a_max": [1, 3], + "b_max": [6, 8], + "a_98": [0.98, 2.98]}, + index=pd.Index(['a', 'b'], name='group'), + columns=['b_min', 'a_min', 'a_mean', + 'a_max', 'b_max', 'a_98']) + if not compat.PY36: + expected = expected[['a_98', 'a_max', 'a_mean', + 'a_min', 'b_max', 'b_min']] + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_non_identifier(self): + df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + + result = df.groupby("group").agg(**{'my col': ('A', 'max')}) + expected = pd.DataFrame({'my col': [1, 3]}, + index=pd.Index(['a', 'b'], name='group')) + tm.assert_frame_equal(result, expected) + + def test_duplicate_raises(self): + # TODO: we currently raise on multiple lambdas. We could *maybe* + # update com.get_callable_name to append `_i` to each lambda. 
+ df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match="Function names"): + df.groupby("A").agg(a=("A", "min"), b=("A", "min")) + + def test_agg_relabel_with_level(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product([['A', 'B'], + ['a', 'b']])) + result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'), + cc=('B', 'mean')) + expected = pd.DataFrame({ + 'aa': [0, 1], + 'bb': [0, 1], + 'cc': [1.5, 3.5] + }, index=['A', 'B']) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_other_raises(self): + df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) + grouped = df.groupby("A") + match = 'Must provide' + with pytest.raises(TypeError, match=match): + grouped.agg(foo=1) + + with pytest.raises(TypeError, match=match): + grouped.agg() + + with pytest.raises(TypeError, match=match): + grouped.agg(a=('B', 'max'), b=(1, 2, 3)) + + def test_missing_raises(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + with pytest.raises(KeyError, match="Column 'C' does not exist"): + df.groupby("A").agg(c=('C', 'sum')) + + def test_agg_namedtuple(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.groupby("A").agg( + b=pd.NamedAgg("B", "sum"), + c=pd.NamedAgg(column="B", aggfunc="count") + ) + expected = df.groupby("A").agg(b=("B", "sum"), + c=("B", "count")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 02d8c09bf2c8f..8168cf06ffdb1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -217,6 +217,7 @@ def test_agg_dict_renaming_deprecation(): df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, 'C': {'bar': ['count', 'min']}}) assert "using a dict with renaming" in str(w[0].message) + assert "named aggregation" in str(w[0].message) with tm.assert_produces_warning(FutureWarning, 
check_stacklevel=False): df.groupby('A')[['B', 'C']].agg({'ma': 'max'})