Add GroupBy.pipe method (pandas-dev#17871)

topper-123 · alanbato · commit f927c5035fe6 · 2017-11-10T17:20:53.000-06:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1973,6 +1973,7 @@ Function application
    GroupBy.apply
    GroupBy.aggregate
    GroupBy.transform
+   GroupBy.pipe
 
 Computations / Descriptive Stats
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -1165,6 +1165,55 @@ See the :ref:`visualization documentation<visualization.box>` for more.
   to ``df.boxplot(by="g")``. See :ref:`here<visualization.box.return>` for
   an explanation.
 
+.. _groupby.pipe:
+
+Piping function calls
+~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.21.0
+
+Similar to the functionality provided by ``DataFrame`` and ``Series``, functions
+that take ``GroupBy`` objects can be chained together using a ``pipe`` method to
+allow for a cleaner, more readable syntax. To read about ``.pipe`` in general terms,
+see :ref:`here <basics.pipe>`.
+
+Combining ``.groupby`` and ``.pipe`` is often useful when you need to reuse
+GroupB objects.
+
+For an example, imagine having a DataFrame with columns for stores, products,
+revenue and sold quantity. We'd like to do a groupwise calculation of *prices*
+(i.e. revenue/quantity) per store and per product. We could do this in a
+multi-step operation, but expressing it in terms of piping can make the
+code more readable. First we set the data:
+
+.. ipython:: python
+
+   import numpy as np
+   n = 1000
+   df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n),
+                      'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n),
+                      'Revenue': (np.random.random(n)*50+10).round(2),
+                      'Quantity': np.random.randint(1, 10, size=n)})
+   df.head(2)
+
+Now, to find prices per store/product, we can simply do:
+
+.. ipython:: python
+
+   (df.groupby(['Store', 'Product'])
+      .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum())
+      .unstack().round(2))
+
+Piping can also be expressive when you want to deliver a grouped object to some
+arbitrary function, for example:
+
+.. code-block:: python
+
+   (df.groupby(['Store', 'Product']).pipe(report_func)
+
+where ``report_func`` takes a GroupBy object and creates a report
+from that.
+
 Examples
 --------
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -14,6 +14,8 @@ Highlights include:
   categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`.
 - The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed, see :ref:`here <whatsnew_0210.api_breaking.bottleneck>`
 - Compatibility fixes for pypy, see :ref:`here <whatsnew_0210.pypy>`.
+- ``GroupBy`` objects now have a ``pipe`` method, similar to the one on ``DataFrame`` and ``Series``.
+  This allows for functions that take a ``GroupBy`` to be composed in a clean, readable syntax, see :ref:`here <whatsnew_0210.enhancements.GroupBy_pipe>`.
 
 Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.
 
@@ -202,6 +204,43 @@ still the string ``'category'``. We'll take this moment to remind users that the
 
 See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
 
+.. _whatsnew_0210.enhancements.GroupBy_pipe:
+
+``GroupBy`` objects now have a ``pipe`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``GroupBy`` objects now have a ``pipe`` method, similar to the one on
+``DataFrame`` and ``Series``, that allow for functions that take a
+``GroupBy`` to be composed in a clean, readable syntax. (:issue:`17871`)
+
+For a concrete example on combining ``.groupby`` and ``.pipe`` , imagine having a
+DataFrame with columns for stores, products, revenue and sold quantity. We'd like to
+do a groupwise calculation of *prices* (i.e. revenue/quantity) per store and per product.
+We could do this in a multi-step operation, but expressing it in terms of piping can make the
+code more readable.
+
+First we set the data:
+
+.. ipython:: python
+
+   import numpy as np
+   n = 1000
+   df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n),
+                      'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n),
+                      'Revenue': (np.random.random(n)*50+10).round(2),
+                      'Quantity': np.random.randint(1, 10, size=n)})
+   df.head(2)
+
+Now, to find prices per store/product, we can simply do:
+
+.. ipython:: python
+
+   (df.groupby(['Store', 'Product'])
+      .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum())
+      .unstack().round(2))
+
+See the :ref:`documentation <groupby.pipe>` for more.
+
 .. _whatsnew_0210.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -664,3 +664,38 @@ def _get_distinct_objs(objs):
             ids.add(id(obj))
             res.append(obj)
     return res
+
+
+def _pipe(obj, func, *args, **kwargs):
+    """
+    Apply a function ``func`` to object ``obj`` either by passing obj as the
+    first argument to the function or, in the case that the func is a tuple,
+    interpret the first element of the tuple as a function and pass the obj to
+    that function as a keyword argument whose key is the value of the second
+    element of the tuple.
+
+    Parameters
+    ----------
+    func : callable or tuple of (callable, string)
+        Function to apply to this object or, alternatively, a
+        ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+        string indicating the keyword of `callable`` that expects the
+        object.
+    args : iterable, optional
+        positional arguments passed into ``func``.
+    kwargs : dict, optional
+        a dictionary of keyword arguments passed into ``func``.
+
+    Returns
+    -------
+    object : the return type of ``func``.
+    """
+    if isinstance(func, tuple):
+        func, target = func
+        if target in kwargs:
+            msg = '%s is both the pipe target and a keyword argument' % target
+            raise ValueError(msg)
+        kwargs[target] = obj
+        return func(*args, **kwargs)
+    else:
+        return func(obj, *args, **kwargs)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3497,8 +3497,10 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
             Alternatively a ``(callable, data_keyword)`` tuple where
             ``data_keyword`` is a string indicating the keyword of
             ``callable`` that expects the %(klass)s.
-        args : positional arguments passed into ``func``.
-        kwargs : a dictionary of keyword arguments passed into ``func``.
+        args : iterable, optional
+            positional arguments passed into ``func``.
+        kwargs : mapping, optional
+            a dictionary of keyword arguments passed into ``func``.
 
         Returns
         -------
@@ -3508,7 +3510,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
         -----
 
         Use ``.pipe`` when chaining together functions that expect
-        on Series or DataFrames. Instead of writing
+        Series, DataFrames or GroupBy objects. Instead of writing
 
         >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
 
@@ -3537,15 +3539,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
 
     @Appender(_shared_docs['pipe'] % _shared_doc_kwargs)
     def pipe(self, func, *args, **kwargs):
-        if isinstance(func, tuple):
-            func, target = func
-            if target in kwargs:
-                raise ValueError('%s is both the pipe target and a keyword '
-                                 'argument' % target)
-            kwargs[target] = self
-            return func(*args, **kwargs)
-        else:
-            return func(self, *args, **kwargs)
+        return com._pipe(self, func, *args, **kwargs)
 
     _shared_docs['aggregate'] = ("""
     Aggregate using callable, string, dict, or list of string/callables
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -40,7 +40,7 @@
 
 from pandas.core.common import (_values_from_object, AbstractMethodError,
                                 _default_index, _not_none, _get_callable_name,
-                                _asarray_tuplesafe)
+                                _asarray_tuplesafe, _pipe)
 
 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
                               DataError, SpecificationError)
@@ -656,9 +656,10 @@ def __iter__(self):
     @Substitution(name='groupby')
     def apply(self, func, *args, **kwargs):
         """
-        Apply function and combine results together in an intelligent way. The
-        split-apply-combine combination rules attempt to be as common sense
-        based as possible. For example:
+        Apply function and combine results together in an intelligent way.
+
+        The split-apply-combine combination rules attempt to be as common
+        sense based as possible. For example:
 
         case 1:
         group DataFrame
@@ -692,7 +693,10 @@ def apply(self, func, *args, **kwargs):
 
         See also
         --------
-        aggregate, transform"""
+        pipe : Apply function to the full GroupBy object instead of to each
+            group.
+        aggregate, transform
+        """
 
         func = self._is_builtin_func(func)
 
@@ -1691,6 +1695,54 @@ def tail(self, n=5):
         mask = self._cumcount_array(ascending=False) < n
         return self._selected_obj[mask]
 
+    def pipe(self, func, *args, **kwargs):
+        """ Apply a function with arguments to this GroupBy object,
+
+        .. versionadded:: 0.21.0
+
+        Parameters
+        ----------
+        func : callable or tuple of (callable, string)
+            Function to apply to this GroupBy object or, alternatively, a
+            ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+            string indicating the keyword of ``callable`` that expects the
+            GroupBy object.
+        args : iterable, optional
+               positional arguments passed into ``func``.
+        kwargs : dict, optional
+                 a dictionary of keyword arguments passed into ``func``.
+
+        Returns
+        -------
+        object : the return type of ``func``.
+
+        Notes
+        -----
+        Use ``.pipe`` when chaining together functions that expect
+        Series, DataFrames or GroupBy objects. Instead of writing
+
+        >>> f(g(h(df.groupby('group')), arg1=a), arg2=b, arg3=c)
+
+        You can write
+
+        >>> (df
+        ...    .groupby('group')
+        ...    .pipe(f, arg1)
+        ...    .pipe(g, arg2)
+        ...    .pipe(h, arg3))
+
+        See more `here
+        <http://pandas.pydata.org/pandas-docs/stable/groupby.html#pipe>`_
+
+        See Also
+        --------
+        pandas.Series.pipe : Apply a function with arguments to a series
+        pandas.DataFrame.pipe: Apply a function with arguments to a dataframe
+        apply : Apply function to each group instead of to the
+            full GroupBy object.
+        """
+        return _pipe(self, func, *args, **kwargs)
+
 
 GroupBy._add_numeric_operations()
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -3762,6 +3762,75 @@ def test_gb_key_len_equal_axis_len(self):
             assert df.loc[('foo', 'bar', 'B')] == 2
             assert df.loc[('foo', 'baz', 'C')] == 1
 
+    def test_pipe(self):
+        # Test the pipe method of DataFrameGroupBy.
+        # Issue #17871
+
+        random_state = np.random.RandomState(1234567890)
+
+        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                              'foo', 'bar', 'foo', 'foo'],
+                        'B': random_state.randn(8),
+                        'C': random_state.randn(8)})
+
+        def f(dfgb):
+            return dfgb.B.max() - dfgb.C.min().min()
+
+        def square(srs):
+            return srs ** 2
+
+        # Note that the transformations are
+        # GroupBy -> Series
+        # Series -> Series
+        # This then chains the GroupBy.pipe and the
+        # NDFrame.pipe methods
+        result = df.groupby('A').pipe(f).pipe(square)
+
+        index = Index([u'bar', u'foo'], dtype='object', name=u'A')
+        expected = pd.Series([8.99110003361, 8.17516964785], name='B',
+                             index=index)
+
+        assert_series_equal(expected, result)
+
+    def test_pipe_args(self):
+        # Test passing args to the pipe method of DataFrameGroupBy.
+        # Issue #17871
+
+        df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
+                           'x': [1.0, 2.0, 3.0, 2.0, 5.0],
+                           'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
+
+        def f(dfgb, arg1):
+            return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
+                        .groupby(dfgb.grouper))
+
+        def g(dfgb, arg2):
+            return dfgb.sum() / dfgb.sum().sum() + arg2
+
+        def h(df, arg3):
+            return df.x + df.y - arg3
+
+        result = (df
+                  .groupby('group')
+                  .pipe(f, 0)
+                  .pipe(g, 10)
+                  .pipe(h, 100))
+
+        # Assert the results here
+        index = pd.Index(['A', 'B', 'C'], name='group')
+        expected = pd.Series([-79.5160891089, -78.4839108911, None],
+                             index=index)
+
+        assert_series_equal(expected, result)
+
+        # test SeriesGroupby.pipe
+        ser = pd.Series([1, 1, 2, 2, 3, 3])
+        result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
+
+        expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))
+
+        assert_series_equal(result, expected)
+
 
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py
@@ -239,17 +239,17 @@ def test_groupby_blacklist(df_letters):
 def test_tab_completion(mframe):
     grp = mframe.groupby(level='second')
     results = set([v for v in dir(grp) if not v.startswith('_')])
-    expected = set(
-        ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
-         'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
-         'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot',
-         'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
-         'nunique', 'head', 'describe', 'cummax', 'quantile',
-         'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
-         'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew',
-         'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
-         'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
-         'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
+    expected = {
+        'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
+        'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
+        'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot',
+        'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
+        'nunique', 'head', 'describe', 'cummax', 'quantile',
+        'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
+        'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew',
+        'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
+        'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
+        'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe'}
     assert results == expected