ENH: Add pipe method to GroupBy (fixes pandas-dev#10353)

ghl3 · ghl3 · commit b3686e1e254d · 2015-09-13T12:46:07.000-04:00
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -1002,6 +1002,67 @@ See the :ref:`visualization documentation<visualization.box>` for more.
   to ``df.boxplot(by="g")``. See :ref:`here<visualization.box.return>` for
   an explanation.
 
+
+.. _groupby.pipe:
+
+Piping function calls
+~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.17.0
+
+Similar to the functionality provided by ``DataFrames`` and ``Series``, functions
+that take ``GroupBy`` objects can be chained together using a ``pipe`` method to
+allow for a cleaner, more readable syntax.
+
+Imagine that one had functions f, g, and h that each takes a ``DataFrameGroupBy``
+as well as a single argument and returns a ``DataFrameGroupBy``, and one wanted
+to apply these functions in succession to a grouped DataFrame.  Instead of having
+to deeply compose these functions and their arguments, such as:
+
+.. code-block:: python
+
+   >>> h(g(f(df.groupby('group'), arg1), arg2), arg3)
+
+one can write the following:
+
+.. code-block:: python
+
+   >>> (df
+   ...    .groupby('group')
+   ...    .pipe(f, arg1)
+   ...    .pipe(g, arg2)
+   ...    .pipe(h, arg3))
+
+For a more concrete example, imagine one wanted to group a DataFrame by column
+'A' and then take the square of the difference between the maximum
+value of 'B' in each group and the overall minimum value of 'C' (across all
+groups). One could write this as a pipeline of functions applied to the original
+dataframe:
+
+.. code-block:: python
+
+    def f(dfgb):
+        """
+        Take a DataFrameGroupBy and return a Series
+        where each value corresponds to the maximum
+        value of column 'B' in each group minus the
+        global minimum of column 'C'.
+        """
+        return dfgb.B.max() - dfgb.C.min().min()
+
+    def square(srs):
+        """
+        Take a Series and transform it by
+        squaring each value.
+        """
+        return srs ** 2
+
+    res = df.groupby('A').pipe(f).pipe(square)
+
+
+For more details on pipeline functionality, see :ref:`here<basics.pipe>`.
+
+
 Examples
 --------
 
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -468,6 +468,9 @@ Other enhancements
 - ``pd.read_csv`` can now read bz2-compressed files incrementally, and the C parser can read bz2-compressed files from AWS S3 (:issue:`11070`, :issue:`11072`).
 
 
+- ``GroupBy`` objects now have a ``pipe`` method, similar to the one on ``DataFrame`` and ``Series``, that allows functions that take a ``GroupBy`` to be composed in a clean, readable syntax (:issue:`10353`). See the :ref:`documentation <groupby.pipe>` for more.
+
+
 .. _whatsnew_0170.api:
 
 .. _whatsnew_0170.api_breaking:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -26,6 +26,7 @@
                                 AbstractMethodError)
 import pandas.core.nanops as nanops
 from pandas.util.decorators import Appender, Substitution, deprecate_kwarg
+from pandas.tools.util import _pipe
 from pandas.core import config
 
 
@@ -2169,7 +2170,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
         -----
 
         Use ``.pipe`` when chaining together functions that expect
-        on Series or DataFrames. Instead of writing
+        on Series, DataFrames, or GroupBys. Instead of writing
 
         >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
 
@@ -2191,22 +2192,15 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
 
         See Also
         --------
+        pandas.GroupBy.pipe
         pandas.DataFrame.apply
         pandas.DataFrame.applymap
         pandas.Series.map
     """
     )
     @Appender(_shared_docs['pipe'] % _shared_doc_kwargs)
     def pipe(self, func, *args, **kwargs):
-        if isinstance(func, tuple):
-            func, target = func
-            if target in kwargs:
-                msg = '%s is both the pipe target and a keyword argument' % target
-                raise ValueError(msg)
-            kwargs[target] = self
-            return func(*args, **kwargs)
-        else:
-            return func(self, *args, **kwargs)
+        return _pipe(self, func, *args, **kwargs)
 
     #----------------------------------------------------------------------
     # Attribute access
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -14,13 +14,14 @@
 from pandas.core.base import PandasObject
 from pandas.core.categorical import Categorical
 from pandas.core.frame import DataFrame
-from pandas.core.generic import NDFrame
+from pandas.core.generic import NDFrame, _pipe
 from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index
 from pandas.core.internals import BlockManager, make_block
 from pandas.core.series import Series
 from pandas.core.panel import Panel
 from pandas.util.decorators import (cache_readonly, Appender, make_signature,
                                     deprecate_kwarg)
+from pandas.tools.util import _pipe
 import pandas.core.algorithms as algos
 import pandas.core.common as com
 from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
@@ -1076,6 +1077,59 @@ def tail(self, n=5):
         tail = obj[in_tail]
         return tail
 
+    def pipe(self, func, *args, **kwargs):
+        """ Apply a function with arguments to this GroupBy object
+
+        .. versionadded:: 0.17.0
+
+        Parameters
+        ----------
+        func : callable or tuple of (callable, string)
+               Function to apply to this GroupBy or, alternatively, a
+               ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+               string indicating the keyword of ``callable`` that expects the
+               GroupBy object.
+        args : iterable, optional
+               positional arguments passed into ``func``.
+        kwargs : any, dictionary
+                 a dictionary of keyword arguments passed into ``func``.
+
+        Returns
+        -------
+        object : the return type of ``func``.
+
+        Notes
+        -----
+
+        Use ``.pipe`` when chaining together functions that expect
+        a GroupBy, or when alternating between functions that take
+        a DataFrame and a GroupBy.
+
+        Assuming that one has a function f that takes and returns
+        a DataFrameGroupBy, a function g that takes a DataFrameGroupBy
+        and returns a DataFrame, and a function h that takes a DataFrame,
+        instead of having to write:
+
+        >>> h(g(f(df.groupby('group'), arg1), arg2), arg3)
+
+        You can write
+
+        >>> (df
+        ...    .groupby('group')
+        ...    .pipe(f, arg1)
+        ...    .pipe(g, arg2)
+        ...    .pipe(h, arg3))
+
+
+        See Also
+        --------
+        pandas.Series.pipe
+        pandas.DataFrame.pipe
+        pandas.GroupBy.apply
+        """
+        return _pipe(self, func, *args, **kwargs)
+
+
     def _cumcount_array(self, arr=None, ascending=True):
         """
         arr is where cumcount gets its values from
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -5159,7 +5159,7 @@ def test_tab_completion(self):
                         'resample', 'cummin', 'fillna', 'cumsum', 'cumcount',
                         'all', 'shift', 'skew', 'bfill', 'ffill',
                         'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
-                        'cov', 'dtypes', 'diff', 'idxmax', 'idxmin'
+                        'cov', 'dtypes', 'diff', 'idxmax', 'idxmin', 'pipe'
                         ])
         self.assertEqual(results, expected)
 
@@ -5467,6 +5467,7 @@ def test_func(x):
         expected = DataFrame()
         tm.assert_frame_equal(result, expected)
 
+
     def test_first_last_max_min_on_time_data(self):
         # GH 10295
         # Verify that NaT is not in the result of max, min, first and last on
@@ -5512,6 +5513,66 @@ def test_sort(x):
                 g.apply(test_sort)
 
 
+    def test_pipe(self):
+        # Test the pipe method of DataFrameGroupBy.
+        # Issue #10353
+
+        random_state = np.random.RandomState(1234567890)
+
+        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                              'foo', 'bar', 'foo', 'foo'],
+                        'B': random_state.randn(8),
+                        'C': random_state.randn(8)})
+
+        def f(dfgb):
+            return dfgb.B.max() - dfgb.C.min().min()
+
+        def square(srs):
+            return srs ** 2
+
+        # Note that the transformations are
+        # GroupBy -> Series
+        # Series -> Series
+        # This then chains the GroupBy.pipe and the
+        # NDFrame.pipe methods
+        res = df.groupby('A').pipe(f).pipe(square)
+
+        index = Index([u'bar', u'foo'], dtype='object', name=u'A')
+        expected = pd.Series([8.99110003361, 8.17516964785], name='B', index=index)
+
+        assert_series_equal(expected, res)
+
+
+    def test_pipe_args(self):
+        # Test passing args to the pipe method of DataFrameGroupBy.
+        # Issue #10353
+
+        df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
+                           'x': [1.0, 2.0, 3.0, 2.0, 5.0],
+                           'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
+
+        def f(dfgb, arg1):
+            return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby(dfgb.grouper)
+
+        def g(dfgb, arg2):
+            return dfgb.sum() / dfgb.sum().sum() + arg2
+
+        def h(df, arg3):
+            return df.x + df.y - arg3
+
+        res = (df
+               .groupby('group')
+               .pipe(f, 0)
+               .pipe(g, 10)
+               .pipe(h, 100))
+
+        # Assert the results here
+        index = pd.Index(['A', 'B', 'C'], name='group')
+        expected = pd.Series([-79.5160891089, -78.4839108911, None], index=index)
+
+        assert_series_equal(expected, res)
+
+
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
 
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
@@ -48,3 +48,25 @@ def compose(*funcs):
     """Compose 2 or more callables"""
     assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
     return reduce(_compose2, funcs)
+
+
+def _pipe(obj, func, *args, **kwargs):
+    """
+    Apply a function to an object either by
+    passing the obj as the first argument
+    to the function or, in the case that
+    the func is a tuple, interpret the first
+    element of the tuple as a function and
+    pass the obj to that function as a keyword
+    argument whose key is the value of the
+    second element of the tuple
+    """
+    if isinstance(func, tuple):
+        func, target = func
+        if target in kwargs:
+            msg = '%s is both the pipe target and a keyword argument' % target
+            raise ValueError(msg)
+        kwargs[target] = obj
+        return func(*args, **kwargs)
+    else:
+        return func(obj, *args, **kwargs)