From 608a0e48b1481180d59fbb7f8cb5c0bfa15f1257 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 14 Oct 2017 20:43:15 +0100 Subject: [PATCH] Add GroupBy.pipe method --- doc/source/api.rst | 1 + doc/source/groupby.rst | 49 ++++++++++++++++++ doc/source/whatsnew/v0.21.0.txt | 39 +++++++++++++++ pandas/core/common.py | 35 +++++++++++++ pandas/core/generic.py | 18 +++---- pandas/core/groupby.py | 62 +++++++++++++++++++++-- pandas/tests/groupby/test_groupby.py | 69 ++++++++++++++++++++++++++ pandas/tests/groupby/test_whitelist.py | 22 ++++---- 8 files changed, 267 insertions(+), 28 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 1e63a938ff389..bec35bac0d33e 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1973,6 +1973,7 @@ Function application GroupBy.apply GroupBy.aggregate GroupBy.transform + GroupBy.pipe Computations / Descriptive Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 175ea28122606..316244b583aa2 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1165,6 +1165,55 @@ See the :ref:`visualization documentation` for more. to ``df.boxplot(by="g")``. See :ref:`here` for an explanation. +.. _groupby.pipe: + +Piping function calls +~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.21.0 + +Similar to the functionality provided by ``DataFrame`` and ``Series``, functions +that take ``GroupBy`` objects can be chained together using a ``pipe`` method to +allow for a cleaner, more readable syntax. To read about ``.pipe`` in general terms, +see :ref:`here <basics.pipe>`. + +Combining ``.groupby`` and ``.pipe`` is often useful when you need to reuse +GroupBy objects. + +For an example, imagine having a DataFrame with columns for stores, products, +revenue and sold quantity. We'd like to do a groupwise calculation of *prices* +(i.e. revenue/quantity) per store and per product. 
We could do this in a +multi-step operation, but expressing it in terms of piping can make the +code more readable. First we set the data: + +.. ipython:: python + + import numpy as np + n = 1000 + df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), + 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), + 'Revenue': (np.random.random(n)*50+10).round(2), + 'Quantity': np.random.randint(1, 10, size=n)}) + df.head(2) + +Now, to find prices per store/product, we can simply do: + +.. ipython:: python + + (df.groupby(['Store', 'Product']) + .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum()) + .unstack().round(2)) + +Piping can also be expressive when you want to deliver a grouped object to some +arbitrary function, for example: + +.. code-block:: python + + df.groupby(['Store', 'Product']).pipe(report_func) + +where ``report_func`` takes a GroupBy object and creates a report +from that. + Examples -------- diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5577089c776ed..0e232fc2e6c15 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -14,6 +14,8 @@ Highlights include: categoricals independent of the data, see :ref:`here `. - The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, see :ref:`here ` - Compatibility fixes for pypy, see :ref:`here `. +- ``GroupBy`` objects now have a ``pipe`` method, similar to the one on ``DataFrame`` and ``Series``. + This allows for functions that take a ``GroupBy`` to be composed in a clean, readable syntax, see :ref:`here <whatsnew_0210.enhancements.GroupBy_pipe>`. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -202,6 +204,43 @@ still the string ``'category'``. We'll take this moment to remind users that the See the :ref:`CategoricalDtype docs ` for more. +.. 
_whatsnew_0210.enhancements.GroupBy_pipe: + +``GroupBy`` objects now have a ``pipe`` method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``GroupBy`` objects now have a ``pipe`` method, similar to the one on +``DataFrame`` and ``Series``, that allows for functions that take a +``GroupBy`` to be composed in a clean, readable syntax. (:issue:`17871`) + +For a concrete example of combining ``.groupby`` and ``.pipe``, imagine having a +DataFrame with columns for stores, products, revenue and sold quantity. We'd like to +do a groupwise calculation of *prices* (i.e. revenue/quantity) per store and per product. +We could do this in a multi-step operation, but expressing it in terms of piping can make the +code more readable. + +First we set the data: + +.. ipython:: python + + import numpy as np + n = 1000 + df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), + 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), + 'Revenue': (np.random.random(n)*50+10).round(2), + 'Quantity': np.random.randint(1, 10, size=n)}) + df.head(2) + +Now, to find prices per store/product, we can simply do: + +.. ipython:: python + + (df.groupby(['Store', 'Product']) + .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum()) + .unstack().round(2)) + +See the :ref:`documentation <groupby.pipe>` for more. + .. 
_whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/common.py b/pandas/core/common.py index 7b96700313012..29d278b6efcb0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -664,3 +664,38 @@ def _get_distinct_objs(objs): ids.add(id(obj)) res.append(obj) return res + + +def _pipe(obj, func, *args, **kwargs): + """ + Apply a function ``func`` to object ``obj`` either by passing obj as the + first argument to the function or, in the case that the func is a tuple, + interpret the first element of the tuple as a function and pass the obj to + that function as a keyword argument whose key is the value of the second + element of the tuple. + + Parameters + ---------- + func : callable or tuple of (callable, string) + Function to apply to this object or, alternatively, a + ``(callable, data_keyword)`` tuple where ``data_keyword`` is a + string indicating the keyword of ``callable`` that expects the + object. + args : iterable, optional + positional arguments passed into ``func``. + kwargs : dict, optional + a dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + """ + if isinstance(func, tuple): + func, target = func + if target in kwargs: + msg = '%s is both the pipe target and a keyword argument' % target + raise ValueError(msg) + kwargs[target] = obj + return func(*args, **kwargs) + else: + return func(obj, *args, **kwargs) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5a311afc27c9a..35a26702ad15a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3497,8 +3497,10 @@ def sample(self, n=None, frac=None, replace=False, weights=None, Alternatively a ``(callable, data_keyword)`` tuple where ``data_keyword`` is a string indicating the keyword of ``callable`` that expects the %(klass)s. - args : positional arguments passed into ``func``. - kwargs : a dictionary of keyword arguments passed into ``func``. 
+ args : iterable, optional + positional arguments passed into ``func``. + kwargs : mapping, optional + a dictionary of keyword arguments passed into ``func``. Returns ------- @@ -3508,7 +3510,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, ----- Use ``.pipe`` when chaining together functions that expect - on Series or DataFrames. Instead of writing + Series, DataFrames or GroupBy objects. Instead of writing >>> f(g(h(df), arg1=a), arg2=b, arg3=c) @@ -3537,15 +3539,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, @Appender(_shared_docs['pipe'] % _shared_doc_kwargs) def pipe(self, func, *args, **kwargs): - if isinstance(func, tuple): - func, target = func - if target in kwargs: - raise ValueError('%s is both the pipe target and a keyword ' - 'argument' % target) - kwargs[target] = self - return func(*args, **kwargs) - else: - return func(self, *args, **kwargs) + return com._pipe(self, func, *args, **kwargs) _shared_docs['aggregate'] = (""" Aggregate using callable, string, dict, or list of string/callables diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f13804f347c9f..5c07033f5a68f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -40,7 +40,7 @@ from pandas.core.common import (_values_from_object, AbstractMethodError, _default_index, _not_none, _get_callable_name, - _asarray_tuplesafe) + _asarray_tuplesafe, _pipe) from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) @@ -656,9 +656,10 @@ def __iter__(self): @Substitution(name='groupby') def apply(self, func, *args, **kwargs): """ - Apply function and combine results together in an intelligent way. The - split-apply-combine combination rules attempt to be as common sense - based as possible. For example: + Apply function and combine results together in an intelligent way. + + The split-apply-combine combination rules attempt to be as common + sense based as possible. 
For example: case 1: group DataFrame @@ -692,7 +693,10 @@ def apply(self, func, *args, **kwargs): See also -------- - aggregate, transform""" + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate, transform + """ func = self._is_builtin_func(func) @@ -1691,6 +1695,54 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] + def pipe(self, func, *args, **kwargs): + """ Apply a function with arguments to this GroupBy object. + + .. versionadded:: 0.21.0 + + Parameters + ---------- + func : callable or tuple of (callable, string) + Function to apply to this GroupBy object or, alternatively, a + ``(callable, data_keyword)`` tuple where ``data_keyword`` is a + string indicating the keyword of ``callable`` that expects the + GroupBy object. + args : iterable, optional + positional arguments passed into ``func``. + kwargs : dict, optional + a dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + + Notes + ----- + Use ``.pipe`` when chaining together functions that expect + Series, DataFrames or GroupBy objects. Instead of writing + + >>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) + + You can write + + >>> (df + ... .groupby('group') + ... .pipe(f) + ... .pipe(g, arg1=a) + ... .pipe(h, arg2=b, arg3=c)) + + See more `here + `_ + + See Also + -------- + pandas.Series.pipe : Apply a function with arguments to a series + pandas.DataFrame.pipe : Apply a function with arguments to a dataframe + apply : Apply function to each group instead of to the + full GroupBy object. 
+ """ + return _pipe(self, func, *args, **kwargs) + GroupBy._add_numeric_operations() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 740526e262d16..9d25117fbd954 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3762,6 +3762,75 @@ def test_gb_key_len_equal_axis_len(self): assert df.loc[('foo', 'bar', 'B')] == 2 assert df.loc[('foo', 'baz', 'C')] == 1 + def test_pipe(self): + # Test the pipe method of DataFrameGroupBy. + # Issue #17871 + + random_state = np.random.RandomState(1234567890) + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': random_state.randn(8), + 'C': random_state.randn(8)}) + + def f(dfgb): + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + return srs ** 2 + + # Note that the transformations are + # GroupBy -> Series + # Series -> Series + # This then chains the GroupBy.pipe and the + # NDFrame.pipe methods + result = df.groupby('A').pipe(f).pipe(square) + + index = Index([u'bar', u'foo'], dtype='object', name=u'A') + expected = pd.Series([8.99110003361, 8.17516964785], name='B', + index=index) + + assert_series_equal(expected, result) + + def test_pipe_args(self): + # Test passing args to the pipe method of DataFrameGroupBy. 
+ # Issue #17871 + + df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], + 'x': [1.0, 2.0, 3.0, 2.0, 5.0], + 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) + + def f(dfgb, arg1): + return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) + .groupby(dfgb.grouper)) + + def g(dfgb, arg2): + return dfgb.sum() / dfgb.sum().sum() + arg2 + + def h(df, arg3): + return df.x + df.y - arg3 + + result = (df + .groupby('group') + .pipe(f, 0) + .pipe(g, 10) + .pipe(h, 100)) + + # Assert the results here + index = pd.Index(['A', 'B', 'C'], name='group') + expected = pd.Series([-79.5160891089, -78.4839108911, None], + index=index) + + assert_series_equal(expected, result) + + # test SeriesGroupby.pipe + ser = pd.Series([1, 1, 2, 2, 3, 3]) + result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) + + expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) + + assert_series_equal(result, expected) + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 259f466316c41..e8e2150558edb 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -239,17 +239,17 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): grp = mframe.groupby(level='second') results = set([v for v in dir(grp) if not v.startswith('_')]) - expected = set( - ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', - 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', - 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'nunique', 'head', 'describe', 'cummax', 'quantile', - 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', - 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 
'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) + expected = { + 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', + 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', + 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', + 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', + 'nunique', 'head', 'describe', 'cummax', 'quantile', + 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', + 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', + 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', + 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe'} assert results == expected