From aa43cf6007116c611242e6b35d7ca6bfd9a2b187 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 May 2019 16:33:26 -0500 Subject: [PATCH 01/22] ENH: Support nested renaming / selection --- doc/source/user_guide/groupby.rst | 24 +++++ doc/source/whatsnew/v0.25.0.rst | 23 +++++ pandas/core/groupby/generic.py | 92 +++++++++++++++++-- .../tests/groupby/aggregate/test_aggregate.py | 77 ++++++++++++++++ 4 files changed, 209 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 4f116a42253e5..119b6a3ddde33 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -601,6 +601,30 @@ must be either implemented on GroupBy or available via :ref:`dispatching grouped.agg({'D': 'std', 'C': 'mean'}) grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) +.. versionadded:: 0.25.0 + +To support column-specific aggregation with control over the output column names, pandas +accepts the special syntax where + +1. The keywords are the *output* column names +2. The first element of each tuple is the column to select +3. The second element of each tuple is the aggregation function to apply to that column. + +.. ipython:: python + + grouped.agg(d_std=('D', 'std'), c_mean=('C', 'mean')) + +If your desired output column names are not valid python keywords, construct a dictionary +and unpack the keyword arguments + +.. ipython:: python + + grouped.agg(**{'d_std': ('D', 'std'), 'mean of C': ('C', 'mean')}) + +Additional keyword arguments are not passed through to the aggregation functions. Only pairs +of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions +requires additional arguments, partially apply them with :meth:`functools.partial`. + .. 
_groupby.aggregate.cython: Cython-optimized aggregation functions diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 49518c57fc846..bb0a71c631743 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -19,6 +19,29 @@ These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog including other versions of pandas. +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_0250.enhancements.agg_relabel: + +Groupby Aggregation with Relabling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has added special syntax for naming the output columns when applying multiple aggreation functions to specific +columns. + +.. ipython:: python + + df = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0]}) + grouper = df.groupby("kind") + grouper.agg(max_height=('height', 'max'), average_weight=('weight', 'mean')) + +Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` +should be tuples where the first element is the column selection, and the second element is the +aggregation function to apply. + .. _whatsnew_0250.enhancements.other: Other Enhancements diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b5b6553d2ae69..bff4d9e86fc1f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -144,8 +144,30 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, return new_items, new_blocks def aggregate(self, func, *args, **kwargs): - _level = kwargs.pop('_level', None) + + relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + if relabeling: + # Normalize the aggregation functions as Dict[column, List[func]], + # process normally, then fixup the names. 
+ # TODO(Py35): When we drop python 3.5, change this to + # defaultdict(list) + func = OrderedDict() + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for i, (name, (column, aggfunc)) in enumerate(zip(columns, pairs)): + if column in func: + func[column].append(aggfunc) + else: + func[column] = [aggfunc] + order.append((column, _get_agg_name(aggfunc))) + kwargs = {} + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of " + "'(column, aggfunc).") + result, how = self._aggregate(func, _level=_level, *args, **kwargs) if how is None: return result @@ -179,6 +201,10 @@ def aggregate(self, func, *args, **kwargs): self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) + if relabeling: + result = result[order] + result.columns = columns + return result._convert(datetime=True) agg = aggregate @@ -791,11 +817,8 @@ def _aggregate_multiple_funcs(self, arg, _level): # list of functions / function names columns = [] for f in arg: - if isinstance(f, str): - columns.append(f) - else: - # protect against callables without names - columns.append(com.get_callable_name(f)) + columns.append(_get_agg_name(f)) + arg = zip(columns, arg) results = OrderedDict() @@ -1292,6 +1315,16 @@ class DataFrameGroupBy(NDFrameGroupBy): A 1 1 2 0.590716 2 3 4 0.704907 + + To control the output names with different aggregations + per column, pass tuples of ``(column, aggfunc))`` as kwargs + + >>> df.groupby("A").agg(b_min=("B", "min"), c_sum=("C", "sum")) + >>> + b_min c_sum + A + 1 1 0.825627 + 2 3 2.218618 """) @Substitution(see_also=_agg_see_also_doc, @@ -1300,7 +1333,7 @@ class DataFrameGroupBy(NDFrameGroupBy): klass='DataFrame', axis='') @Appender(_shared_docs['aggregate']) - def aggregate(self, arg, *args, **kwargs): + def aggregate(self, arg=None, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) agg = aggregate @@ -1573,3 +1606,48 @@ def groupby_series(obj, col=None): return results boxplot = 
boxplot_frame_groupby + + +def _is_multi_agg_with_relabel(**kwargs): + """ + Check whether the kwargs pass to .agg look like multi-agg with relabling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> _is_multi_agg_with_relabel(a='max') + False + >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> _is_multi_agg_with_relabel() + """ + return all( + isinstance(v, tuple) and len(v) == 2 + for v in kwargs.values() + ) and kwargs + + +def _get_agg_name(arg): + """ + + Parameters + ---------- + arg + + Returns + ------- + + """ + if isinstance(arg, str): + return arg + else: + # protect against callables without names + return com.get_callable_name(arg) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 66ea5ac244398..b4f67f6c20e1e 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -313,3 +313,80 @@ def test_order_aggregate_multiple_funcs(): expected = pd.Index(['sum', 'max', 'mean', 'ohlc', 'min']) tm.assert_index_equal(result, expected) + + +def test_agg_relabel(): + df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + result = df.groupby("group").agg( + a_max=("A", "max"), + b_max=("B", "max"), + ) + expected = pd.DataFrame({ + "a_max": [1, 3], + "b_max": [6, 8], + }, index=pd.Index(['a', 'b'], name='group')) + tm.assert_frame_equal(result, expected) + + # order invariance + result = df.groupby('group').agg( + b_min=("B", "min"), + a_min=("A", min), + a_max=("A", "max"), + b_max=("B", "max"), + ) + expected = pd.DataFrame({"b_min": [5, 7], + "a_min": [0, 2], + "a_max": [1, 3], + "b_max": [6, 8]}, + index=pd.Index(['a', 'b'], name='group'), + columns=['b_min', 'a_min', 'a_max', 'b_max']) + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_non_identifier(): + df = pd.DataFrame({"group": 
['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + + result = df.groupby("group").agg(**{'my col': ('A', 'max')}) + expected = pd.DataFrame({'my col': [1, 3]}, + index=pd.Index(['a', 'b'], name='group')) + tm.assert_frame_equal(result, expected) + + +def test_duplicate_raises(): + # TODO: we currently raise on multiple lambdas. We could *maybe* + # update com.get_callable_name to append `_i` to each lambda. + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match="Function names"): + df.groupby("A").agg(a=("A", "min"), b=("A", "min")) + + +def test_agg_relabel_with_level(): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product([['A', 'B'], + ['a', 'b']])) + result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'), + cc=('B', 'mean')) + expected = pd.DataFrame({ + 'aa': [0, 1], + 'bb': [0, 1], + 'cc': [1.5, 3.5] + }, index=['A', 'B']) + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_other_raises(): + df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) + grouped = df.groupby("A") + match = 'Must provide' + with pytest.raises(TypeError, match=match): + grouped.agg(foo=1) + + with pytest.raises(TypeError, match=match): + grouped.agg() + + with pytest.raises(TypeError, match=match): + grouped.agg(a=('B', 'max'), b=(1, 2, 3)) From 10c8f4029e249b6c2fa356b4c3dc7d889f39a30b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 May 2019 13:32:18 -0500 Subject: [PATCH 02/22] 3.5 note --- doc/source/user_guide/groupby.rst | 32 +++- doc/source/whatsnew/v0.25.0.rst | 10 +- pandas/core/groupby/generic.py | 5 + .../tests/groupby/aggregate/test_aggregate.py | 156 ++++++++++-------- 4 files changed, 119 insertions(+), 84 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 119b6a3ddde33..3b27e80bc220c 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ 
-601,30 +601,50 @@ must be either implemented on GroupBy or available via :ref:`dispatching grouped.agg({'D': 'std', 'C': 'mean'}) grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) +.. _groupby.aggregate.keyword: + .. versionadded:: 0.25.0 To support column-specific aggregation with control over the output column names, pandas -accepts the special syntax where +accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation", where -1. The keywords are the *output* column names -2. The first element of each tuple is the column to select -3. The second element of each tuple is the aggregation function to apply to that column. +- The keywords are the *output* column names +- The values are tuples whose first element is the column to select + and the second element is the function to apply to that column. .. ipython:: python - grouped.agg(d_std=('D', 'std'), c_mean=('C', 'mean')) + animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals + + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), + average_weight=('height', np.mean), + ) If your desired output column names are not valid python keywords, construct a dictionary and unpack the keyword arguments .. ipython:: python - grouped.agg(**{'d_std': ('D', 'std'), 'mean of C': ('C', 'mean')}) + animals.groupby("kind").agg(**{ + 'total weight': ('weight', sum), + }) Additional keyword arguments are not passed through to the aggregation functions. Only pairs of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions requires additional arguments, partially apply them with :meth:`functools.partial`. +.. note:: + + For Python 3.5 and earlier, the order of ``**kwargs`` in a functions was not + preserved. 
Because the indeterminate keyword ordering would result in indeterminate + output column ordering, keyword aggregation is not supported for Python 3.5. A + ``RuntimeError`` will be raised instead. + .. _groupby.aggregate.cython: Cython-optimized aggregation functions diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1f62db35f260f..b68d80a4e6dcb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -24,11 +24,11 @@ Enhancements .. _whatsnew_0250.enhancements.agg_relabel: -Groupby Aggregation with Relabling -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Groupby Aggregation with Relabeling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has added special syntax for naming the output columns when applying multiple aggreation functions to specific -columns. +Pandas has added special groupby behavior, known as "keyword aggregation", for naming the +output columns when applying multiple aggregation functions to specific columns (:issue:`18366`). .. ipython:: python @@ -40,7 +40,7 @@ columns. Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. +aggregation function to apply. See :ref:`_groupby.aggregate.keyword` for more. .. 
_whatsnew_0250.enhancements.other: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 987c3fc01d87c..2a7e53d3e207f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -15,6 +15,7 @@ import numpy as np from pandas._libs import Timestamp, lib +from pandas.compat import PY36 from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution @@ -148,6 +149,10 @@ def aggregate(self, func, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: + if not PY36: + raise RuntimeError("Keyword aggregation is not supported " + "on Python 3.5.") + # Normalize the aggregation functions as Dict[column, List[func]], # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b4f67f6c20e1e..9a676f1e7f87e 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -7,7 +7,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -315,78 +315,88 @@ def test_order_aggregate_multiple_funcs(): tm.assert_index_equal(result, expected) -def test_agg_relabel(): +@pytest.mark.skipif(not compat.PY36, + reason="Keyword aggregation requires 3.6 or above.") +class TestKeywordAggregation: + + def test_agg_relabel(self): + df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + result = df.groupby("group").agg( + a_max=("A", "max"), + b_max=("B", "max"), + ) + expected = pd.DataFrame({"a_max": [1, 3], "b_max": [6, 8]}, + index=pd.Index(['a', 'b'], 
name='group'), + columns=['a_max', 'b_max']) + tm.assert_frame_equal(result, expected) + + # order invariance + result = df.groupby('group').agg( + b_min=("B", "min"), + a_min=("A", min), + a_max=("A", "max"), + b_max=("B", "max"), + ) + expected = pd.DataFrame({"b_min": [5, 7], + "a_min": [0, 2], + "a_max": [1, 3], + "b_max": [6, 8]}, + index=pd.Index(['a', 'b'], name='group'), + columns=['b_min', 'a_min', 'a_max', 'b_max']) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_non_identifier(self): + df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + + result = df.groupby("group").agg(**{'my col': ('A', 'max')}) + expected = pd.DataFrame({'my col': [1, 3]}, + index=pd.Index(['a', 'b'], name='group')) + tm.assert_frame_equal(result, expected) + + def test_duplicate_raises(self): + # TODO: we currently raise on multiple lambdas. We could *maybe* + # update com.get_callable_name to append `_i` to each lambda. + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match="Function names"): + df.groupby("A").agg(a=("A", "min"), b=("A", "min")) + + def test_agg_relabel_with_level(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product([['A', 'B'], + ['a', 'b']])) + result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'), + cc=('B', 'mean')) + expected = pd.DataFrame({ + 'aa': [0, 1], + 'bb': [0, 1], + 'cc': [1.5, 3.5] + }, index=['A', 'B']) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_other_raises(self): + df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) + grouped = df.groupby("A") + match = 'Must provide' + with pytest.raises(TypeError, match=match): + grouped.agg(foo=1) + + with pytest.raises(TypeError, match=match): + grouped.agg() + + with pytest.raises(TypeError, match=match): + grouped.agg(a=('B', 'max'), b=(1, 2, 3)) + + +@pytest.mark.skipif(compat.PY36, + reason="Keyword 
aggregation supported on 3.6+") +def test_agg_relabel_35_raises(): df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}) - result = df.groupby("group").agg( - a_max=("A", "max"), - b_max=("B", "max"), - ) - expected = pd.DataFrame({ - "a_max": [1, 3], - "b_max": [6, 8], - }, index=pd.Index(['a', 'b'], name='group')) - tm.assert_frame_equal(result, expected) - - # order invariance - result = df.groupby('group').agg( - b_min=("B", "min"), - a_min=("A", min), - a_max=("A", "max"), - b_max=("B", "max"), - ) - expected = pd.DataFrame({"b_min": [5, 7], - "a_min": [0, 2], - "a_max": [1, 3], - "b_max": [6, 8]}, - index=pd.Index(['a', 'b'], name='group'), - columns=['b_min', 'a_min', 'a_max', 'b_max']) - tm.assert_frame_equal(result, expected) - - -def test_agg_relabel_non_identifier(): - df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], - "A": [0, 1, 2, 3], - "B": [5, 6, 7, 8]}) - - result = df.groupby("group").agg(**{'my col': ('A', 'max')}) - expected = pd.DataFrame({'my col': [1, 3]}, - index=pd.Index(['a', 'b'], name='group')) - tm.assert_frame_equal(result, expected) - - -def test_duplicate_raises(): - # TODO: we currently raise on multiple lambdas. We could *maybe* - # update com.get_callable_name to append `_i` to each lambda. 
- df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) - with pytest.raises(SpecificationError, match="Function names"): - df.groupby("A").agg(a=("A", "min"), b=("A", "min")) - - -def test_agg_relabel_with_level(): - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, - index=pd.MultiIndex.from_product([['A', 'B'], - ['a', 'b']])) - result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'), - cc=('B', 'mean')) - expected = pd.DataFrame({ - 'aa': [0, 1], - 'bb': [0, 1], - 'cc': [1.5, 3.5] - }, index=['A', 'B']) - tm.assert_frame_equal(result, expected) - - -def test_agg_relabel_other_raises(): - df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) - grouped = df.groupby("A") - match = 'Must provide' - with pytest.raises(TypeError, match=match): - grouped.agg(foo=1) - - with pytest.raises(TypeError, match=match): - grouped.agg() - - with pytest.raises(TypeError, match=match): - grouped.agg(a=('B', 'max'), b=(1, 2, 3)) + gr = df.groupby('group') + with pytest.raises(RuntimeError): + gr.agg(foo=("A", "sum")) From 2e52653620ddf15e413309cc83260298b8b90e55 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 May 2019 16:49:13 -0500 Subject: [PATCH 03/22] sort for py35 --- pandas/core/groupby/generic.py | 3 +-- .../tests/groupby/aggregate/test_aggregate.py | 17 +++-------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2a7e53d3e207f..6e28cff3a7567 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -150,8 +150,7 @@ def aggregate(self, func, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: if not PY36: - raise RuntimeError("Keyword aggregation is not supported " - "on Python 3.5.") + kwargs = OrderedDict(sorted(kwargs.items())) # Normalize the aggregation functions as Dict[column, List[func]], # process normally, then fixup the names. 
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 9a676f1e7f87e..78de70acd2301 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -7,7 +7,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, concat, compat from pandas.core.base import SpecificationError from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -315,8 +315,6 @@ def test_order_aggregate_multiple_funcs(): tm.assert_index_equal(result, expected) -@pytest.mark.skipif(not compat.PY36, - reason="Keyword aggregation requires 3.6 or above.") class TestKeywordAggregation: def test_agg_relabel(self): @@ -345,6 +343,8 @@ def test_agg_relabel(self): "b_max": [6, 8]}, index=pd.Index(['a', 'b'], name='group'), columns=['b_min', 'a_min', 'a_max', 'b_max']) + if not compat.PY36: + expected = expected[['a_max', 'a_min', 'b_max', 'b_min']] tm.assert_frame_equal(result, expected) def test_agg_relabel_non_identifier(self): @@ -389,14 +389,3 @@ def test_agg_relabel_other_raises(self): with pytest.raises(TypeError, match=match): grouped.agg(a=('B', 'max'), b=(1, 2, 3)) - - -@pytest.mark.skipif(compat.PY36, - reason="Keyword aggregation supported on 3.6+") -def test_agg_relabel_35_raises(): - df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], - "A": [0, 1, 2, 3], - "B": [5, 6, 7, 8]}) - gr = df.groupby('group') - with pytest.raises(RuntimeError): - gr.agg(foo=("A", "sum")) From 06a86ec4924d33cc73399a21bdab4781d22935e0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 May 2019 16:51:03 -0500 Subject: [PATCH 04/22] sort for py35 --- doc/source/user_guide/groupby.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 3b27e80bc220c..7de6da3e36b4f 
100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -642,8 +642,7 @@ requires additional arguments, partially apply them with :meth:`functools.partia For Python 3.5 and earlier, the order of ``**kwargs`` in a functions was not preserved. Because the indeterminate keyword ordering would result in indeterminate - output column ordering, keyword aggregation is not supported for Python 3.5. A - ``RuntimeError`` will be raised instead. + output column ordering, the output columns will always be sorted for Python 3.5. .. _groupby.aggregate.cython: From 14f66e6ad564a7668cc5416234e8295ced7611c0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 May 2019 10:34:59 -0500 Subject: [PATCH 05/22] updates --- doc/source/user_guide/groupby.rst | 5 +++- doc/source/whatsnew/v0.25.0.rst | 14 +++++++---- pandas/core/groupby/generic.py | 23 +++---------------- .../tests/groupby/aggregate/test_aggregate.py | 21 +++++++++++++---- 4 files changed, 33 insertions(+), 30 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 7de6da3e36b4f..1d9122bd2294d 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -603,9 +603,12 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. _groupby.aggregate.keyword: +Keyword Aggregation +~~~~~~~~~~~~~~~~~~~ + .. 
versionadded:: 0.25.0 -To support column-specific aggregation with control over the output column names, pandas +To support column-specific aggregation *with control over the output column names*, pandas accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation", where - The keywords are the *output* column names diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 665d2ad5bd9f3..150c5f7258da7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -32,11 +32,15 @@ output columns when applying multiple aggregation functions to specific columns .. ipython:: python - df = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]}) - grouper = df.groupby("kind") - grouper.agg(max_height=('height', 'max'), average_weight=('weight', 'mean')) + animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), + average_weight=('height', np.mean), + ) Pass the desired columns names as the ``**kwargs`` to ``.agg``. 
The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6e28cff3a7567..9c853ea8ee41d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -165,7 +165,8 @@ def aggregate(self, func, *args, **kwargs): func[column].append(aggfunc) else: func[column] = [aggfunc] - order.append((column, _get_agg_name(aggfunc))) + order.append((column, + com.get_callable_name(aggfunc) or aggfunc)) kwargs = {} elif func is None: # nicer error message @@ -821,7 +822,7 @@ def _aggregate_multiple_funcs(self, arg, _level): # list of functions / function names columns = [] for f in arg: - columns.append(_get_agg_name(f)) + columns.append(com.get_callable_name(f) or f) arg = zip(columns, arg) @@ -1641,21 +1642,3 @@ def _is_multi_agg_with_relabel(**kwargs): isinstance(v, tuple) and len(v) == 2 for v in kwargs.values() ) and kwargs - - -def _get_agg_name(arg): - """ - - Parameters - ---------- - arg - - Returns - ------- - - """ - if isinstance(arg, str): - return arg - else: - # protect against callables without names - return com.get_callable_name(arg) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 78de70acd2301..27638baf8564b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -2,12 +2,13 @@ test .agg behavior / note that .apply is tested generally in test_groupby.py """ from collections import OrderedDict +import functools import numpy as np import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat, compat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -331,20 +332,27 @@ def 
test_agg_relabel(self): tm.assert_frame_equal(result, expected) # order invariance + p98 = functools.partial(np.percentile, q=98) result = df.groupby('group').agg( b_min=("B", "min"), a_min=("A", min), + a_mean=("A", np.mean), a_max=("A", "max"), b_max=("B", "max"), + a_98=("A", p98) ) expected = pd.DataFrame({"b_min": [5, 7], "a_min": [0, 2], + "a_mean": [0.5, 2.5], "a_max": [1, 3], - "b_max": [6, 8]}, + "b_max": [6, 8], + "a_98": [0.98, 2.98]}, index=pd.Index(['a', 'b'], name='group'), - columns=['b_min', 'a_min', 'a_max', 'b_max']) + columns=['b_min', 'a_min', 'a_mean', + 'a_max', 'b_max', 'a_98']) if not compat.PY36: - expected = expected[['a_max', 'a_min', 'b_max', 'b_min']] + expected = expected[['a_98', 'a_max', 'a_mean', + 'a_min', 'b_max', 'b_min']] tm.assert_frame_equal(result, expected) def test_agg_relabel_non_identifier(self): @@ -389,3 +397,8 @@ def test_agg_relabel_other_raises(self): with pytest.raises(TypeError, match=match): grouped.agg(a=('B', 'max'), b=(1, 2, 3)) + + def test_missing_raises(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + with pytest.raises(KeyError, match="Column 'C' does not exist"): + df.groupby("A").agg(c=('C', 'sum')) From 2c3d11a4e2ce2840c84dc58cb4c8426e58d3cc6f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 May 2019 14:58:11 -0500 Subject: [PATCH 06/22] added Agg helper --- doc/source/user_guide/groupby.rst | 10 ++++---- doc/source/whatsnew/v0.25.0.rst | 9 +++++--- pandas/__init__.py | 2 +- pandas/core/api.py | 2 +- pandas/core/groupby/__init__.py | 2 +- pandas/core/groupby/generic.py | 23 +++++++++++++------ pandas/tests/api/test_api.py | 1 + .../tests/groupby/aggregate/test_aggregate.py | 8 +++++++ 8 files changed, 40 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 1d9122bd2294d..e809b5c5ab384 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -613,7 +613,9 @@ accepts the special syntax in 
:meth:`GroupBy.agg`, known as "keyword aggregation - The keywords are the *output* column names - The values are tuples whose first element is the column to select - and the second element is the function to apply to that column. + and the second element is the function to apply to that column. Pandas + provides the ``pandas.Agg`` namedtuple with the fields ``['column', 'aggfunc']`` + to make it clearer what the arguments are. .. ipython:: python @@ -623,8 +625,8 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation animals animals.groupby("kind").agg( - min_height=('height', 'min'), - max_height=('height', 'max'), + min_height=pd.Agg(column='height', aggfunc='min')), + max_height=pd.Agg(column='height', aggfunc='max')), average_weight=('height', np.mean), ) @@ -634,7 +636,7 @@ and unpack the keyword arguments .. ipython:: python animals.groupby("kind").agg(**{ - 'total weight': ('weight', sum), + 'total weight': pd.Agg(column='weight', aggfunc=sum)), }) Additional keyword arguments are not passed through to the aggregation functions. Only pairs diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 150c5f7258da7..ff9c46bc8d24e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -37,14 +37,17 @@ output columns when applying multiple aggregation functions to specific columns 'weight': [7.9, 7.5, 9.9, 198.0]}) animals animals.groupby("kind").agg( - min_height=('height', 'min'), - max_height=('height', 'max'), + min_height=pd.Agg(column='height', aggfunc='min')), + max_height=pd.Agg(column='height', aggfunc='max')), average_weight=('height', np.mean), ) Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. See :ref:`_groupby.aggregate.keyword` for more. +aggregation function to apply. 
Pandas provides the ``pandas.Agg`` namedtuple to make it clearer +what the arguments to the function are, but plain tuples are accepted as well. + +See :ref:`_groupby.aggregate.keyword` for more. .. _whatsnew_0250.enhancements.other: diff --git a/pandas/__init__.py b/pandas/__init__.py index bd367bbe27d5e..8f8c4d34033cf 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,7 @@ to_numeric, to_datetime, to_timedelta, # misc - np, TimeGrouper, Grouper, factorize, unique, value_counts, + np, Agg, TimeGrouper, Grouper, factorize, unique, value_counts, array, Categorical, set_eng_float_format, Series, DataFrame, Panel) diff --git a/pandas/core/api.py b/pandas/core/api.py index 96f623bda9a8a..79bd7314a7147 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -21,7 +21,7 @@ DatetimeTZDtype, ) from pandas.core.arrays import Categorical, array -from pandas.core.groupby import Grouper +from pandas.core.groupby import Grouper, Agg from pandas.io.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index ac35f3825e5e8..7f08dbd3baba7 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,4 @@ from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.generic import ( # noqa: F401 - SeriesGroupBy, DataFrameGroupBy) + Agg, SeriesGroupBy, DataFrameGroupBy) from pandas.core.groupby.grouper import Grouper # noqa: F401 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9c853ea8ee41d..901e2dc361cbb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -6,7 +6,7 @@ which here returns a DataFrameGroupBy object. 
""" -from collections import OrderedDict, abc +from collections import OrderedDict, abc, namedtuple import copy from functools import partial from textwrap import dedent @@ -42,6 +42,8 @@ from pandas.plotting._core import boxplot_frame_groupby +Agg = namedtuple("NamedAgg", ["column", "aggfunc"]) + class NDFrameGroupBy(GroupBy): @@ -1326,14 +1328,21 @@ class DataFrameGroupBy(NDFrameGroupBy): 2 3 4 0.704907 To control the output names with different aggregations - per column, pass tuples of ``(column, aggfunc))`` as kwargs + per column, pass supports "keyword aggregation" - >>> df.groupby("A").agg(b_min=("B", "min"), c_sum=("C", "sum")) - >>> - b_min c_sum + >>> df.groupby("A").agg( + ... b_min=pd.Agg(column="B", aggfunc="min"), + ... c_sum=pd.Agg(column="C", aggfunc="sum")) + b_min c_sum A - 1 1 0.825627 - 2 3 2.218618 + 1 1 -1.956929 + 2 3 -0.322183 + + The keywords are the column names, and the values should be + 2-tuples where the first element is the column selection and + the second element is the aggfunc. Pandas provides the + ``pandas.Agg`` namedtuple to clarify the meaning of the values. + See :ref:`groupby.aggregate.keyword` for more. 
""") @Substitution(see_also=_agg_see_also_doc, diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 7ee0225723675..d87b8b6c862d4 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -47,6 +47,7 @@ class TestPDApi(Base): 'DatetimeTZDtype', 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', + 'Agg', ] # these are already deprecated; awaiting removal diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 27638baf8564b..e21f93ce04193 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -402,3 +402,11 @@ def test_missing_raises(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) with pytest.raises(KeyError, match="Column 'C' does not exist"): df.groupby("A").agg(c=('C', 'sum')) + + def test_agg_namedtuple(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.groupby("A").agg(b=pd.Agg("B", "sum"), + c=pd.Agg(column="B", aggfunc="count")) + expected = df.groupby("A").agg(b=("B", "sum"), + c=("B", "count")) + tm.assert_frame_equal(result, expected) From cdf93735505de01a983fded3d27d3887572db80a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 May 2019 15:30:12 -0500 Subject: [PATCH 07/22] Agg -> KeywordAgg --- doc/source/user_guide/groupby.rst | 6 +++--- doc/source/whatsnew/v0.25.0.rst | 4 ++-- pandas/__init__.py | 2 +- pandas/core/api.py | 2 +- pandas/core/groupby/__init__.py | 4 ++-- pandas/core/groupby/generic.py | 6 +++--- pandas/tests/groupby/aggregate/test_aggregate.py | 6 ++++-- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index e809b5c5ab384..2fea3a979bfba 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -625,8 +625,8 @@ accepts the special syntax in 
:meth:`GroupBy.agg`, known as "keyword aggregation animals animals.groupby("kind").agg( - min_height=pd.Agg(column='height', aggfunc='min')), - max_height=pd.Agg(column='height', aggfunc='max')), + min_height=pd.KeywordAgg(column='height', aggfunc='min')), + max_height=pd.KeywordAgg(column='height', aggfunc='max')), average_weight=('height', np.mean), ) @@ -636,7 +636,7 @@ and unpack the keyword arguments .. ipython:: python animals.groupby("kind").agg(**{ - 'total weight': pd.Agg(column='weight', aggfunc=sum)), + 'total weight': pd.KeywordAgg(column='weight', aggfunc=sum)), }) Additional keyword arguments are not passed through to the aggregation functions. Only pairs diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ff9c46bc8d24e..57e8d67a37505 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -37,8 +37,8 @@ output columns when applying multiple aggregation functions to specific columns 'weight': [7.9, 7.5, 9.9, 198.0]}) animals animals.groupby("kind").agg( - min_height=pd.Agg(column='height', aggfunc='min')), - max_height=pd.Agg(column='height', aggfunc='max')), + min_height=pd.KeywordAgg(column='height', aggfunc='min')), + max_height=pd.KeywordAgg(column='height', aggfunc='max')), average_weight=('height', np.mean), ) diff --git a/pandas/__init__.py b/pandas/__init__.py index 8f8c4d34033cf..feeac6000b35d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,7 @@ to_numeric, to_datetime, to_timedelta, # misc - np, Agg, TimeGrouper, Grouper, factorize, unique, value_counts, + np, KeywordAgg, TimeGrouper, Grouper, factorize, unique, value_counts, array, Categorical, set_eng_float_format, Series, DataFrame, Panel) diff --git a/pandas/core/api.py b/pandas/core/api.py index 79bd7314a7147..75936a4bc6321 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -21,7 +21,7 @@ DatetimeTZDtype, ) from pandas.core.arrays import Categorical, array -from pandas.core.groupby import 
Grouper, Agg +from pandas.core.groupby import Grouper, KeywordAgg from pandas.io.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 7f08dbd3baba7..e6616e835b662 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,4 @@ -from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.generic import ( # noqa: F401 - Agg, SeriesGroupBy, DataFrameGroupBy) + DataFrameGroupBy, KeywordAgg, SeriesGroupBy) +from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.grouper import Grouper # noqa: F401 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 901e2dc361cbb..bfa889075c65f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -42,7 +42,7 @@ from pandas.plotting._core import boxplot_frame_groupby -Agg = namedtuple("NamedAgg", ["column", "aggfunc"]) +KeywordAgg = namedtuple("KeywordAgg", ["column", "aggfunc"]) class NDFrameGroupBy(GroupBy): @@ -1331,8 +1331,8 @@ class DataFrameGroupBy(NDFrameGroupBy): per column, pass supports "keyword aggregation" >>> df.groupby("A").agg( - ... b_min=pd.Agg(column="B", aggfunc="min"), - ... c_sum=pd.Agg(column="C", aggfunc="sum")) + ... b_min=pd.KeywordAgg(column="B", aggfunc="min"), + ... 
c_sum=pd.KeywordAgg(column="C", aggfunc="sum")) b_min c_sum A 1 1 -1.956929 diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e21f93ce04193..2c03c5a7e095d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -405,8 +405,10 @@ def test_missing_raises(self): def test_agg_namedtuple(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - result = df.groupby("A").agg(b=pd.Agg("B", "sum"), - c=pd.Agg(column="B", aggfunc="count")) + result = df.groupby("A").agg( + b=pd.KeywordAgg("B", "sum"), + c=pd.KeywordAgg(column="B", aggfunc="count") + ) expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) tm.assert_frame_equal(result, expected) From c0cd575f53118d9588eef8e012266d3b300da161 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 May 2019 09:22:36 -0500 Subject: [PATCH 08/22] doc fixups --- doc/source/user_guide/groupby.rst | 6 +++--- doc/source/whatsnew/v0.25.0.rst | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 2fea3a979bfba..bcc46df5003ff 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -625,8 +625,8 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation animals animals.groupby("kind").agg( - min_height=pd.KeywordAgg(column='height', aggfunc='min')), - max_height=pd.KeywordAgg(column='height', aggfunc='max')), + min_height=pd.KeywordAgg(column='height', aggfunc='min'), + max_height=pd.KeywordAgg(column='height', aggfunc='max'), average_weight=('height', np.mean), ) @@ -636,7 +636,7 @@ and unpack the keyword arguments .. 
ipython:: python animals.groupby("kind").agg(**{ - 'total weight': pd.KeywordAgg(column='weight', aggfunc=sum)), + 'total weight': pd.KeywordAgg(column='weight', aggfunc=sum), }) Additional keyword arguments are not passed through to the aggregation functions. Only pairs diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 57e8d67a37505..ed1d00f1c46eb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -37,8 +37,8 @@ output columns when applying multiple aggregation functions to specific columns 'weight': [7.9, 7.5, 9.9, 198.0]}) animals animals.groupby("kind").agg( - min_height=pd.KeywordAgg(column='height', aggfunc='min')), - max_height=pd.KeywordAgg(column='height', aggfunc='max')), + min_height=pd.KeywordAgg(column='height', aggfunc='min'), + max_height=pd.KeywordAgg(column='height', aggfunc='max'), average_weight=('height', np.mean), ) From 386cca1402b42bc1eaa4a4571cae2190b97d0001 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 May 2019 10:41:30 -0500 Subject: [PATCH 09/22] fix api test --- pandas/tests/api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index d87b8b6c862d4..7af081803f482 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -47,7 +47,7 @@ class TestPDApi(Base): 'DatetimeTZDtype', 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', - 'Agg', + 'KeywordAgg', ] # these are already deprecated; awaiting removal From 2f6e1dc14c1ef2a8986d5a8437638d46f3fab7f0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 May 2019 11:17:45 -0500 Subject: [PATCH 10/22] wip --- pandas/core/base.py | 7 ++++++ pandas/core/frame.py | 2 +- pandas/core/groupby/generic.py | 41 ++++++++++++++++++-------------- pandas/tests/frame/test_apply.py | 19 +++++++++++++++ 4 files changed, 50 insertions(+), 19 deletions(-) 
diff --git a/pandas/core/base.py b/pandas/core/base.py index 1d0e7fc413eb9..76f9d2fee6e62 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -326,6 +326,8 @@ def _aggregate(self, arg, *args, **kwargs): how can be a string describe the required post-processing, or None if not required """ + from pandas.core.groupby.generic import ( + _is_multi_agg_with_relabel, _normalize_keyword_aggregation) is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False @@ -333,6 +335,11 @@ def _aggregate(self, arg, *args, **kwargs): if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) + is_relabeling = arg is None and _is_multi_agg_with_relabel(**kwargs) + if is_relabeling: + arg, columns, order = _normalize_keyword_aggregation(kwargs) + args = () # TODO: test + kwargs = {} if isinstance(arg, str): return self._try_aggregate_string_function(arg, *args, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 76fd393341694..e70d2a05ac02c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6321,7 +6321,7 @@ def _gotitem(self, versionadded='\n.. versionadded:: 0.20.0\n', **_shared_doc_kwargs) @Appender(_shared_docs['aggregate']) - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) result = None diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bfa889075c65f..7e6a0834c61f5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -151,24 +151,8 @@ def aggregate(self, func, *args, **kwargs): relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: - if not PY36: - kwargs = OrderedDict(sorted(kwargs.items())) - - # Normalize the aggregation functions as Dict[column, List[func]], - # process normally, then fixup the names. 
- # TODO(Py35): When we drop python 3.5, change this to - # defaultdict(list) - func = OrderedDict() - order = [] - columns, pairs = list(zip(*kwargs.items())) - - for i, (name, (column, aggfunc)) in enumerate(zip(columns, pairs)): - if column in func: - func[column].append(aggfunc) - else: - func[column] = [aggfunc] - order.append((column, - com.get_callable_name(aggfunc) or aggfunc)) + func, columns, order = _normalize_keyword_aggregation(kwargs) + kwargs = {} elif func is None: # nicer error message @@ -1651,3 +1635,24 @@ def _is_multi_agg_with_relabel(**kwargs): isinstance(v, tuple) and len(v) == 2 for v in kwargs.values() ) and kwargs + + +def _normalize_keyword_aggregation(kwargs): + if not PY36: + kwargs = OrderedDict(sorted(kwargs.items())) + # Normalize the aggregation functions as Dict[column, List[func]], + # process normally, then fixup the names. + # TODO(Py35): When we drop python 3.5, change this to + # defaultdict(list) + func = OrderedDict() + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for i, (name, (column, aggfunc)) in enumerate(zip(columns, pairs)): + if column in func: + func[column].append(aggfunc) + else: + func[column] = [aggfunc] + order.append((column, + com.get_callable_name(aggfunc) or aggfunc)) + return func, columns, order diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 0c09956b3f2fb..7aecb0f255ce0 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1153,3 +1153,22 @@ def test_frequency_is_original(self, num_cols): df = DataFrame(1, index=index, columns=range(num_cols)) df.apply(lambda x: x) assert index.freq == original.freq + + +class TestKeywordAgg: + def test_basic(self): + df = pd.DataFrame({"A": [1, 2]}) + agg_funcs = ['sum', 'mean', 'min', 'max'] + result = df.agg(**{k: ("A", k) for k in agg_funcs}) + expected = df.agg(agg_funcs) + tm.assert_frame_equal(result, expected) + + result = df.agg(**{k: pd.KeywordAgg("A", k) for k in 
agg_funcs}) + expected = df.agg(agg_funcs) + tm.assert_frame_equal(result, expected) + + def test_multiple(self): + df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b']}) + result = df.agg(a_min=("A", np.min), b_min=("B", "min")) + expected = df.agg('min') + tm.assert_frame_equal(result, expected) From 6c1f56782960d52b324a956479e0917fbd8144b1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 14:34:18 -0500 Subject: [PATCH 11/22] fixups --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/groupby/generic.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5d0d9b2f39ad4..f96a102d98b37 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -44,7 +44,7 @@ output columns when applying multiple aggregation functions to specific columns Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. Pandas provides the ``pandas.Agg`` namedtuple to make it clearer +aggregation function to apply. Pandas provides the ``pandas.KeywordAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. See :ref:`_groupby.aggregate.keyword` for more. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7e6a0834c61f5..572de0e5d9ae1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1325,7 +1325,7 @@ class DataFrameGroupBy(NDFrameGroupBy): The keywords are the column names, and the values should be 2-tuples where the first element is the column selection and the second element is the aggfunc. Pandas provides the - ``pandas.Agg`` namedtuple to clarify the meaning of the values. + ``pandas.KeywordAgg`` namedtuple to clarify the meaning of the values. 
See :ref:`groupby.aggregate.keyword` for more. """) @@ -1648,7 +1648,7 @@ def _normalize_keyword_aggregation(kwargs): order = [] columns, pairs = list(zip(*kwargs.items())) - for i, (name, (column, aggfunc)) in enumerate(zip(columns, pairs)): + for name, (column, aggfunc) in zip(columns, pairs): if column in func: func[column].append(aggfunc) else: From bcc63f57af074ed359031b20e64d48b4c6ae78fd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 14:46:30 -0500 Subject: [PATCH 12/22] fixup --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3ccf4ac4b4e49..afe37bf198ab7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6327,7 +6327,7 @@ def _gotitem(self, versionadded='\n.. versionadded:: 0.20.0\n', **_shared_doc_kwargs) @Appender(_shared_docs['aggregate']) - def aggregate(self, func=None, axis=0, *args, **kwargs): + def aggregate(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) result = None From 769a90983864b13ebd12e4ec11e442aaeeef20cc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 14:54:40 -0500 Subject: [PATCH 13/22] added type --- pandas/core/groupby/generic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 572de0e5d9ae1..8b7f5b4632052 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -10,6 +10,8 @@ import copy from functools import partial from textwrap import dedent +import typing +from typing import Any, Callable, List, Union import warnings import numpy as np @@ -43,6 +45,8 @@ from pandas.plotting._core import boxplot_frame_groupby KeywordAgg = namedtuple("KeywordAgg", ["column", "aggfunc"]) +# TODO(typing) the return value on this callable should be any *scalar*. 
+AggScalar = Union[str, Callable[..., Any]] class NDFrameGroupBy(GroupBy): @@ -1644,7 +1648,7 @@ def _normalize_keyword_aggregation(kwargs): # process normally, then fixup the names. # TODO(Py35): When we drop python 3.5, change this to # defaultdict(list) - func = OrderedDict() + func = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]] order = [] columns, pairs = list(zip(*kwargs.items())) From 1da90d438c8c1739e254b457e4a0acf965de1fc5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 15:14:44 -0500 Subject: [PATCH 14/22] docs --- doc/source/user_guide/groupby.rst | 74 ++++++++++++++++--------------- pandas/core/groupby/generic.py | 1 + 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index bcc46df5003ff..4b4c109674a0e 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -568,39 +568,6 @@ For a grouped ``DataFrame``, you can rename in a similar manner: 'mean': 'bar', 'std': 'baz'})) - -Applying different functions to DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -By passing a dict to ``aggregate`` you can apply a different aggregation to the -columns of a DataFrame: - -.. ipython:: python - - grouped.agg({'C': np.sum, - 'D': lambda x: np.std(x, ddof=1)}) - -The function names can also be strings. In order for a string to be valid it -must be either implemented on GroupBy or available via :ref:`dispatching -`: - -.. ipython:: python - - grouped.agg({'C': 'sum', 'D': 'std'}) - -.. note:: - - If you pass a dict to ``aggregate``, the ordering of the output columns is - non-deterministic. If you want to be sure the output columns will be in a specific - order, you can use an ``OrderedDict``. Compare the output of the following two commands: - -.. 
ipython:: python - - from collections import OrderedDict - - grouped.agg({'D': 'std', 'C': 'mean'}) - grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) - .. _groupby.aggregate.keyword: Keyword Aggregation @@ -613,9 +580,10 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation - The keywords are the *output* column names - The values are tuples whose first element is the column to select - and the second element is the function to apply to that column. Pandas - provides the ``pandas.Agg`` namedtuple with the fields ``['column', 'aggfunc']`` - to make it clearer what the arguments are. + and the second element is the aggregation to apply to that column. Pandas + provides the ``pandas.KeywordAgg`` namedtuple with the fields ``['column', 'aggfunc']`` + to make it clearer what the arguments are. As usual, the aggregation can + be a callable or a string alias. .. ipython:: python @@ -649,6 +617,40 @@ requires additional arguments, partially apply them with :meth:`functools.partia preserved. Because the indeterminate keyword ordering would result in indeterminate output column ordering, the output columns will always be sorted for Python 3.5. + +Applying different functions to DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By passing a dict to ``aggregate`` you can apply a different aggregation to the +columns of a DataFrame: + +.. ipython:: python + + grouped.agg({'C': np.sum, + 'D': lambda x: np.std(x, ddof=1)}) + +The function names can also be strings. In order for a string to be valid it +must be either implemented on GroupBy or available via :ref:`dispatching +`: + +.. ipython:: python + + grouped.agg({'C': 'sum', 'D': 'std'}) + +.. note:: + + If you pass a dict to ``aggregate``, the ordering of the output columns is + non-deterministic. If you want to be sure the output columns will be in a specific + order, you can use an ``OrderedDict``. Compare the output of the following two commands: + +.. 
ipython:: python + + from collections import OrderedDict + + grouped.agg({'D': 'std', 'C': 'mean'}) + grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) + + .. _groupby.aggregate.cython: Cython-optimized aggregation functions diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8b7f5b4632052..cfeb0f5a58add 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1634,6 +1634,7 @@ def _is_multi_agg_with_relabel(**kwargs): ... a_min=('a', 'min')) True >>> _is_multi_agg_with_relabel() + False """ return all( isinstance(v, tuple) and len(v) == 2 From 0ddd51fea64eebe9e42b726a389a8b14b4a50bf3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 21 May 2019 11:48:55 -0500 Subject: [PATCH 15/22] remove DataFrame.agg test --- pandas/core/base.py | 7 ------- pandas/tests/frame/test_apply.py | 19 ------------------- 2 files changed, 26 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index dd18f9e4e26e2..f7837c60c0b82 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -317,8 +317,6 @@ def _aggregate(self, arg, *args, **kwargs): how can be a string describe the required post-processing, or None if not required """ - from pandas.core.groupby.generic import ( - _is_multi_agg_with_relabel, _normalize_keyword_aggregation) is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False @@ -326,11 +324,6 @@ def _aggregate(self, arg, *args, **kwargs): if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) - is_relabeling = arg is None and _is_multi_agg_with_relabel(**kwargs) - if is_relabeling: - arg, columns, order = _normalize_keyword_aggregation(kwargs) - args = () # TODO: test - kwargs = {} if isinstance(arg, str): return self._try_aggregate_string_function(arg, *args, diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 7aecb0f255ce0..0c09956b3f2fb 100644 --- a/pandas/tests/frame/test_apply.py 
+++ b/pandas/tests/frame/test_apply.py @@ -1153,22 +1153,3 @@ def test_frequency_is_original(self, num_cols): df = DataFrame(1, index=index, columns=range(num_cols)) df.apply(lambda x: x) assert index.freq == original.freq - - -class TestKeywordAgg: - def test_basic(self): - df = pd.DataFrame({"A": [1, 2]}) - agg_funcs = ['sum', 'mean', 'min', 'max'] - result = df.agg(**{k: ("A", k) for k in agg_funcs}) - expected = df.agg(agg_funcs) - tm.assert_frame_equal(result, expected) - - result = df.agg(**{k: pd.KeywordAgg("A", k) for k in agg_funcs}) - expected = df.agg(agg_funcs) - tm.assert_frame_equal(result, expected) - - def test_multiple(self): - df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b']}) - result = df.agg(a_min=("A", np.min), b_min=("B", "min")) - expected = df.agg('min') - tm.assert_frame_equal(result, expected) From 769d7d3516ad14a8d4b9bb6f41c00b3d892a7dc9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 24 May 2019 11:58:20 -0500 Subject: [PATCH 16/22] fixups --- doc/source/whatsnew/v0.25.0.rst | 3 ++ pandas/core/base.py | 14 ++++++---- pandas/core/groupby/generic.py | 49 +++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4747c1cef4625..4b0c76f59d907 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -47,6 +47,9 @@ should be tuples where the first element is the column selection, and the second aggregation function to apply. Pandas provides the ``pandas.KeywordAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. +Keyword aggregation is the recommended replacement for the deprecated "dict-of-dicts" +approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). + See :ref:`_groupby.aggregate.keyword` for more. .. 
_whatsnew_0250.enhancements.other: diff --git a/pandas/core/base.py b/pandas/core/base.py index 3f59871fb5b38..d620b68ae04d7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -340,11 +340,15 @@ def _aggregate(self, arg, *args, **kwargs): def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 - warnings.warn( - ("using a dict with renaming " - "is deprecated and will be removed in a future " - "version"), - FutureWarning, stacklevel=level) + msg = textwrap.dedent("""\ + using a dict with renaming is deprecated and will be removed + in a future version. + + For column-specific groupby renaming, use keyword aggregation + + >>> df.groupby(...).agg(name=('column', aggfunc)) + """) + warnings.warn(msg, FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cfeb0f5a58add..850405ec8668f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1315,8 +1315,8 @@ class DataFrameGroupBy(NDFrameGroupBy): 1 1 2 0.590716 2 3 4 0.704907 - To control the output names with different aggregations - per column, pass supports "keyword aggregation" + To control the output names with different aggregations per column, + pandas supports "keyword aggregation" >>> df.groupby("A").agg( ... b_min=pd.KeywordAgg(column="B", aggfunc="min"), @@ -1326,10 +1326,13 @@ class DataFrameGroupBy(NDFrameGroupBy): 1 1 -1.956929 2 3 -0.322183 - The keywords are the column names, and the values should be - 2-tuples where the first element is the column selection and - the second element is the aggfunc. Pandas provides the - ``pandas.KeywordAgg`` namedtuple to clarify the meaning of the values. + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. 
+ Pandas provides the ``pandas.KeywordAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + See :ref:`groupby.aggregate.keyword` for more. """) @@ -1643,21 +1646,45 @@ def _is_multi_agg_with_relabel(**kwargs): def _normalize_keyword_aggregation(kwargs): + """ + Normalize user-provided "keyword aggregation" kwargs. + + Transforms from the new ``Dict[str, KeywordAgg]`` style kwargs + to the old OrderedDict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + order : List[Tuple[str, str]] + Pairs of the input and output column names. + + Examples + -------- + >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) + (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) + """ if not PY36: kwargs = OrderedDict(sorted(kwargs.items())) # Normalize the aggregation functions as Dict[column, List[func]], # process normally, then fixup the names. 
# TODO(Py35): When we drop python 3.5, change this to # defaultdict(list) - func = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]] + aggspec = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]] order = [] columns, pairs = list(zip(*kwargs.items())) for name, (column, aggfunc) in zip(columns, pairs): - if column in func: - func[column].append(aggfunc) + if column in aggspec: + aggspec[column].append(aggfunc) else: - func[column] = [aggfunc] + aggspec[column] = [aggfunc] order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - return func, columns, order + return aggspec, columns, order From 1cee0e2e1cbde585766a358cf712aafa3006dea3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 24 May 2019 12:01:48 -0500 Subject: [PATCH 17/22] trigger ci From 6369eb1bb391c794261b6bb4fdc0add1b84b70bd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 24 May 2019 15:52:36 -0500 Subject: [PATCH 18/22] KeywordAgg -> NamedAgg --- doc/source/user_guide/groupby.rst | 28 +++++++++++++------ doc/source/whatsnew/v0.25.0.rst | 12 ++++---- pandas/__init__.py | 2 +- pandas/core/api.py | 2 +- pandas/core/groupby/__init__.py | 2 +- pandas/core/groupby/generic.py | 16 +++++------ pandas/tests/api/test_api.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 6 ++-- 8 files changed, 41 insertions(+), 29 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 4b4c109674a0e..e49bbedddfb11 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -568,20 +568,20 @@ For a grouped ``DataFrame``, you can rename in a similar manner: 'mean': 'bar', 'std': 'baz'})) -.. _groupby.aggregate.keyword: +.. _groupby.aggregate.named: -Keyword Aggregation -~~~~~~~~~~~~~~~~~~~ +Named Aggregation +~~~~~~~~~~~~~~~~~ .. 
versionadded:: 0.25.0 To support column-specific aggregation *with control over the output column names*, pandas -accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation", where +accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", where - The keywords are the *output* column names - The values are tuples whose first element is the column to select and the second element is the aggregation to apply to that column. Pandas - provides the ``pandas.KeywordAgg`` namedtuple with the fields ``['column', 'aggfunc']`` + provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias. @@ -593,18 +593,30 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "keyword aggregation animals animals.groupby("kind").agg( - min_height=pd.KeywordAgg(column='height', aggfunc='min'), - max_height=pd.KeywordAgg(column='height', aggfunc='max'), + min_height=pd.NamedAgg(column='height', aggfunc='min'), + max_height=pd.NamedAgg(column='height', aggfunc='max'), + average_weight=pd.NamedAgg(column='height', aggfunc=np.mean), + ) + + +``pandas.NamedAgg`` is just a ``namedtuple``. Plain tuples are allowed as well. + +.. ipython:: python + + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), average_weight=('height', np.mean), ) + If your desired output column names are not valid python keywords, construct a dictionary and unpack the keyword arguments .. ipython:: python animals.groupby("kind").agg(**{ - 'total weight': pd.KeywordAgg(column='weight', aggfunc=sum), + 'total weight': pd.NamedAgg(column='weight', aggfunc=sum), }) Additional keyword arguments are not passed through to the aggregation functions. 
Only pairs diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4b0c76f59d907..29e5f66bc882f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -27,7 +27,7 @@ Enhancements Groupby Aggregation with Relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has added special groupby behavior, known as "keyword aggregation", for naming the +Pandas has added special groupby behavior, known as "named aggregation", for naming the output columns when applying multiple aggregation functions to specific columns (:issue:`18366`). .. ipython:: python @@ -37,20 +37,20 @@ output columns when applying multiple aggregation functions to specific columns 'weight': [7.9, 7.5, 9.9, 198.0]}) animals animals.groupby("kind").agg( - min_height=pd.KeywordAgg(column='height', aggfunc='min'), - max_height=pd.KeywordAgg(column='height', aggfunc='max'), - average_weight=('height', np.mean), + min_height=pd.NamedAgg(column='height', aggfunc='min'), + max_height=pd.NamedAgg(column='height', aggfunc='max'), + average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), ) Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. Pandas provides the ``pandas.KeywordAgg`` namedtuple to make it clearer +aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. Keyword aggregation is the recommended replacement for the deprecated "dict-of-dicts" approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). -See :ref:`_groupby.aggregate.keyword` for more. +See :ref:`groupby.aggregate.named` for more. ..
_whatsnew_0250.enhancements.other: diff --git a/pandas/__init__.py b/pandas/__init__.py index feeac6000b35d..83b2a725aa5aa 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,7 @@ to_numeric, to_datetime, to_timedelta, # misc - np, KeywordAgg, TimeGrouper, Grouper, factorize, unique, value_counts, + np, NamedAgg, TimeGrouper, Grouper, factorize, unique, value_counts, array, Categorical, set_eng_float_format, Series, DataFrame, Panel) diff --git a/pandas/core/api.py b/pandas/core/api.py index 75936a4bc6321..9f6be5ea533fe 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -21,7 +21,7 @@ DatetimeTZDtype, ) from pandas.core.arrays import Categorical, array -from pandas.core.groupby import Grouper, KeywordAgg +from pandas.core.groupby import Grouper, NamedAgg from pandas.io.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index e6616e835b662..fe50bd91a4f56 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,4 @@ from pandas.core.groupby.generic import ( # noqa: F401 - DataFrameGroupBy, KeywordAgg, SeriesGroupBy) + DataFrameGroupBy, NamedAgg, SeriesGroupBy) from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.grouper import Grouper # noqa: F401 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 850405ec8668f..9f33fadcac0c3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -44,7 +44,7 @@ from pandas.plotting._core import boxplot_frame_groupby -KeywordAgg = namedtuple("KeywordAgg", ["column", "aggfunc"]) +NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. 
AggScalar = Union[str, Callable[..., Any]] @@ -1316,11 +1316,11 @@ class DataFrameGroupBy(NDFrameGroupBy): 2 3 4 0.704907 To control the output names with different aggregations per column, - pandas supports "keyword aggregation" + pandas supports "named aggregation" >>> df.groupby("A").agg( - ... b_min=pd.KeywordAgg(column="B", aggfunc="min"), - ... c_sum=pd.KeywordAgg(column="C", aggfunc="sum")) + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) b_min c_sum A 1 1 -1.956929 @@ -1329,11 +1329,11 @@ class DataFrameGroupBy(NDFrameGroupBy): - The keywords are the *output* column names - The values are tuples whose first element is the column to select and the second element is the aggregation to apply to that column. - Pandas provides the ``pandas.KeywordAgg`` namedtuple with the fields + Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias. - See :ref:`groupby.aggregate.keyword` for more. + See :ref:`groupby.aggregate.named` for more. """) @Substitution(see_also=_agg_see_also_doc, @@ -1647,9 +1647,9 @@ def _is_multi_agg_with_relabel(**kwargs): def _normalize_keyword_aggregation(kwargs): """ - Normalize user-provided "keyword aggregation" kwargs. + Normalize user-provided "named aggregation" kwargs. - Transforms from the new ``Dict[str, KeywordAgg]`` style kwargs + Transforms from the new ``Dict[str, NamedAgg]`` style kwargs to the old OrderedDict[str, List[scalar]]]. 
Parameters diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 7af081803f482..718fd10f8681f 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -47,7 +47,7 @@ class TestPDApi(Base): 'DatetimeTZDtype', 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', - 'KeywordAgg', + 'NamedAgg', ] # these are already deprecated; awaiting removal diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index f50334eb36c7f..9e714a1086037 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -329,7 +329,7 @@ def test_uint64_type_handling(dtype, how): tm.assert_frame_equal(result, expected, check_exact=True) -class TestKeywordAggregation: +class TestNamedAggregation: def test_agg_relabel(self): df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], @@ -419,8 +419,8 @@ def test_missing_raises(self): def test_agg_namedtuple(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) result = df.groupby("A").agg( - b=pd.KeywordAgg("B", "sum"), - c=pd.KeywordAgg(column="B", aggfunc="count") + b=pd.NamedAgg("B", "sum"), + c=pd.NamedAgg(column="B", aggfunc="count") ) expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) From 02d71696a1697aa57aa0f0020ac5c570dfc96241 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 24 May 2019 15:56:59 -0500 Subject: [PATCH 19/22] ordering note --- doc/source/user_guide/groupby.rst | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index e49bbedddfb11..2014dbd9865f3 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -626,9 +626,9 @@ requires additional arguments, partially apply them with :meth:`functools.partia .. 
note:: For Python 3.5 and earlier, the order of ``**kwargs`` in a functions was not - preserved. Because the indeterminate keyword ordering would result in indeterminate - output column ordering, the output columns will always be sorted for Python 3.5. - + preserved. This means that the output column ordering would not be + consistent. To ensure consistent ordering, the keys (and so output columns) + will always be sorted for Python 3.5. Applying different functions to DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -649,20 +649,6 @@ must be either implemented on GroupBy or available via :ref:`dispatching grouped.agg({'C': 'sum', 'D': 'std'}) -.. note:: - - If you pass a dict to ``aggregate``, the ordering of the output columns is - non-deterministic. If you want to be sure the output columns will be in a specific - order, you can use an ``OrderedDict``. Compare the output of the following two commands: - -.. ipython:: python - - from collections import OrderedDict - - grouped.agg({'D': 'std', 'C': 'mean'}) - grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) - - .. _groupby.aggregate.cython: Cython-optimized aggregation functions From 7df14d7d7fcd45d65ecfd84d2d0576c238383b8e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 28 May 2019 09:36:30 -0500 Subject: [PATCH 20/22] fixups --- doc/source/reference/groupby.rst | 3 ++- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/base.py | 2 +- pandas/core/groupby/generic.py | 1 + pandas/tests/groupby/aggregate/test_other.py | 1 + 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 5c8a563a47d00..4f5c9e5d712ee 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -27,7 +27,8 @@ Indexing, iteration Grouper -.. currentmodule:: pandas.core.groupby +.. 
+ Function application -------------------- diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4513178a58b8e..22a29ec31ac45 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -47,7 +47,7 @@ should be tuples where the first element is the column selection, and the second aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. -Keyword aggregation is the recommended replacement for the deprecated "dict-of-dicts" +Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). See :ref:`_groupby.aggregate.named` for more. diff --git a/pandas/core/base.py b/pandas/core/base.py index d620b68ae04d7..e4274e48d3227 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -344,7 +344,7 @@ def nested_renaming_depr(level=4): using a dict with renaming is deprecated and will be removed in a future version. - For column-specific groupby renaming, use keyword aggregation + For column-specific groupby renaming, use named aggregation >>> df.groupby(...).agg(name=('column', aggfunc)) """) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9f33fadcac0c3..faa4d868bb65a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1672,6 +1672,7 @@ def _normalize_keyword_aggregation(kwargs): """ if not PY36: kwargs = OrderedDict(sorted(kwargs.items())) + # Normalize the aggregation functions as Dict[column, List[func]], # process normally, then fixup the names. 
# TODO(Py35): When we drop python 3.5, change this to diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 02d8c09bf2c8f..8168cf06ffdb1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -217,6 +217,7 @@ def test_agg_dict_renaming_deprecation(): df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, 'C': {'bar': ['count', 'min']}}) assert "using a dict with renaming" in str(w[0].message) + assert "named aggregation" in str(w[0].message) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) From cf8db511211cfcba051a573a211b4505ce5bbba3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 28 May 2019 09:37:52 -0500 Subject: [PATCH 21/22] plain --- doc/source/whatsnew/v0.25.0.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 22a29ec31ac45..921dd5405538c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -47,6 +47,14 @@ should be tuples where the first element is the column selection, and the second aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. +.. ipython:: python + + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), + average_weight=('weight', np.mean), + ) + Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`).
From 25dca1ae55918302e6541f5650ced279a48af381 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 May 2019 08:32:31 -0500 Subject: [PATCH 22/22] fixups --- doc/source/reference/groupby.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 4f5c9e5d712ee..5c8a563a47d00 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -27,8 +27,7 @@ Indexing, iteration Grouper -.. - +.. currentmodule:: pandas.core.groupby Function application --------------------