add GroupBy.pipe method

tp · tp · commit 9173deb7f49c · 2017-10-14T20:59:33.000+01:00
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -1165,6 +1165,56 @@ See the :ref:`visualization documentation<visualization.box>` for more.
   to ``df.boxplot(by="g")``. See :ref:`here<visualization.box.return>` for
   an explanation.
 
+.. _groupby.pipe:
+
+Piping function calls
+~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.21.0
+
+Similar to the functionality provided by ``DataFrames`` and ``Series``, functions
+that take ``GroupBy`` objects can be chained together using a ``pipe`` method to
+allow for a cleaner, more readable syntax. To read about ``.pipe`` in general terms,
+see :ref:`here <basics.pipe>`.
+
+For a concrete example on combining ``.groupby`` and ``.pipe``, imagine having a
+DataFrame with columns for stores, products, revenue and sold quantity. We'd like to
+do a groupwise calculation of *prices* (i.e. revenue/quantity per store and per product).
+We could do this in a multi-step operation, but expressing it in terms of piping can make the
+code more readable.
+
+First we set the data:
+
+.. ipython:: python
+
+   from numpy.random import choice
+   n = 1000
+   df = pd.DataFrame({'Store': choice(['Store_1', 'Store_2'], n),
+                      'Product': choice(['Product_1', 'Product_2', 'Product_3'], n),
+                      'Revenue': (np.random.random(n)*50+10).round(2),
+                      'Quantity': np.random.randint(1, 10, size=n)})
+   df.head(2)
+
+Now, to find prices per store/product, we can simply do:
+
+.. ipython:: python
+
+   (df.groupby(['Store', 'Product'])
+      .pipe(lambda grp: grp.Revenue.sum()/grp.Quantity.sum())
+      .unstack().round(2))
+
+Piping can also be expressive when you want to deliver a grouped object to some
+arbitrary function, for example:
+
+.. code-block:: python
+
+   (base_df.pipe(lambda x: x[x.A>3])
+           .groupby(['Store', 'Product'])
+           .pipe(report_func))
+
+where ``report_func`` takes an arbitrary GroupBy object and creates a report
+from that.
+
 Examples
 --------
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -234,6 +234,9 @@ Other Enhancements
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names. (:issue:`14207`)
 - Improved the import time of pandas by about 2.25x.  (:issue:`16764`)
 - :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)
+- ``GroupBy`` objects now have a ``pipe`` method, similar to the one on ``DataFrame`` and ``Series``,
+  that allows functions that take a ``GroupBy`` to be composed in a clean, readable syntax.
+  See the :ref:`documentation <groupby.pipe>` for more. (:issue:`17871`)
 
 .. _whatsnew_0210.api_breaking:
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -664,3 +664,36 @@ def _get_distinct_objs(objs):
             ids.add(id(obj))
             res.append(obj)
     return res
+
+
+def _pipe(obj, func, *args, **kwargs):
+    """
+    Pass ``obj`` to ``func`` positionally or as a named keyword argument.
+
+    Parameters
+    ----------
+    obj : object
+        The object handed to ``func``.
+    func : callable or tuple of (callable, string)
+        Function to apply to ``obj`` or, alternatively, a
+        ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+        string indicating the keyword of ``callable`` that expects ``obj``.
+        A ValueError is raised if ``data_keyword`` is also in ``kwargs``.
+    args : iterable, optional
+        positional arguments passed into ``func``.
+    kwargs : dict, optional
+        a dictionary of keyword arguments passed into ``func``.
+
+    Returns
+    -------
+    object : the return type of ``func``.
+    """
+    if isinstance(func, tuple):
+        func, target = func
+        if target in kwargs:
+            msg = '%s is both the pipe target and a keyword argument' % target
+            raise ValueError(msg)
+        kwargs[target] = obj
+        return func(*args, **kwargs)
+    else:
+        return func(obj, *args, **kwargs)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3482,8 +3482,10 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
             Alternatively a ``(callable, data_keyword)`` tuple where
             ``data_keyword`` is a string indicating the keyword of
             ``callable`` that expects the %(klass)s.
-        args : positional arguments passed into ``func``.
-        kwargs : a dictionary of keyword arguments passed into ``func``.
+        args : iterable, optional
+            positional arguments passed into ``func``.
+        kwargs : mapping, optional
+            a dictionary of keyword arguments passed into ``func``.
 
         Returns
         -------
@@ -3493,7 +3495,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
         -----
 
         Use ``.pipe`` when chaining together functions that expect
-        on Series or DataFrames. Instead of writing
+        Series, DataFrames or GroupBys. Instead of writing
 
         >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
 
@@ -3522,15 +3524,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
 
     @Appender(_shared_docs['pipe'] % _shared_doc_kwargs)
     def pipe(self, func, *args, **kwargs):
-        if isinstance(func, tuple):
-            func, target = func
-            if target in kwargs:
-                raise ValueError('%s is both the pipe target and a keyword '
-                                 'argument' % target)
-            kwargs[target] = self
-            return func(*args, **kwargs)
-        else:
-            return func(self, *args, **kwargs)
+        return com._pipe(self, func, *args, **kwargs)
 
     _shared_docs['aggregate'] = ("""
     Aggregate using callable, string, dict, or list of string/callables
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -40,7 +40,7 @@
 
 from pandas.core.common import (_values_from_object, AbstractMethodError,
                                 _default_index, _not_none, _get_callable_name,
-                                _asarray_tuplesafe)
+                                _asarray_tuplesafe, _pipe)
 
 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
                               DataError, SpecificationError)
@@ -1691,6 +1691,54 @@ def tail(self, n=5):
         mask = self._cumcount_array(ascending=False) < n
         return self._selected_obj[mask]
 
+    def pipe(self, func, *args, **kwargs):
+        """Apply a function with arguments to this GroupBy object.
+
+        .. versionadded:: 0.21.0
+
+        Parameters
+        ----------
+        func : callable or tuple of (callable, string)
+               Function to apply to this GroupBy or, alternatively, a
+               ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+               string indicating the keyword of ``callable`` that expects the
+               GroupBy object.
+        args : iterable, optional
+               positional arguments passed into ``func``.
+        kwargs : dict, optional
+                 a dictionary of keyword arguments passed into ``func``.
+
+        Returns
+        -------
+        object : the return type of ``func``.
+
+        Notes
+        -----
+        Use ``.pipe`` when chaining together functions that expect
+        Series, DataFrames or GroupBys. Instead of writing
+
+        >>> f(g(h(df.groupby('group')), arg1=a), arg2=b, arg3=c)
+
+        You can write
+
+        >>> (df
+        ...    .groupby('group')
+        ...    .pipe(h)
+        ...    .pipe(g, arg1=a)
+        ...    .pipe(f, arg2=b, arg3=c))
+
+        See the :ref:`user guide section on piping function calls
+        <groupby.pipe>` for more.
+
+        See Also
+        --------
+        pandas.Series.pipe
+        pandas.DataFrame.pipe
+        pandas.core.groupby.GroupBy.apply
+        """
+        return _pipe(self, func, *args, **kwargs)
+
+
 
 GroupBy._add_numeric_operations()
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -3914,6 +3914,64 @@ def test_gb_key_len_equal_axis_len(self):
             assert df.loc[('foo', 'bar', 'B')] == 2
             assert df.loc[('foo', 'baz', 'C')] == 1
 
+    def test_pipe(self):
+        # Test the pipe method of DataFrameGroupBy (Issue #17871).
+        # The RandomState seed is fixed so the expected values are stable.
+
+        random_state = np.random.RandomState(1234567890)
+
+        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                              'foo', 'bar', 'foo', 'foo'],
+                        'B': random_state.randn(8),
+                        'C': random_state.randn(8)})
+
+        def f(dfgb):
+            return dfgb.B.max() - dfgb.C.min().min()  # scalar: min over groups
+
+        def square(srs):
+            return srs ** 2
+
+        # f maps GroupBy -> Series and square maps Series -> Series,
+        # so the first .pipe call below is GroupBy.pipe while the
+        # second one is NDFrame.pipe, called on the Series that f
+        # returned. The chain therefore exercises both pipe methods
+        # in a single expression.
+        result = df.groupby('A').pipe(f).pipe(square)
+
+        index = Index([u'bar', u'foo'], dtype='object', name=u'A')
+        expected = pd.Series([8.99110003361, 8.17516964785], name='B', index=index)
+
+        assert_series_equal(expected, result)
+
+    def test_pipe_args(self):
+        # Test passing args through pipe: f and g receive a GroupBy,
+        # h receives the DataFrame returned by g. Issue #17871
+
+        df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
+                           'x': [1.0, 2.0, 3.0, 2.0, 5.0],
+                           'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
+
+        def f(dfgb, arg1):
+            return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby(dfgb.grouper)
+
+        def g(dfgb, arg2):
+            return dfgb.sum() / dfgb.sum().sum() + arg2
+
+        def h(df, arg3):
+            return df.x + df.y - arg3
+
+        result = (df
+                  .groupby('group')
+                  .pipe(f, 0)
+                  .pipe(g, 10)
+                  .pipe(h, 100))
+
+        # Group C fails the filter in f (mean y <= 0): expected value is missing
+        index = pd.Index(['A', 'B', 'C'], name='group')
+        expected = pd.Series([-79.5160891089, -78.4839108911, None], index=index)
+
+        assert_series_equal(expected, result)
+
 
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)