Skip to content

Commit b3686e1

Browse files
committed
ENH: Add pipe method to GroupBy (fixes pandas-dev#10353)
1 parent e8d4243 commit b3686e1

File tree

6 files changed

+207
-12
lines changed

6 files changed

+207
-12
lines changed

doc/source/groupby.rst

+61
Original file line numberDiff line numberDiff line change
@@ -1002,6 +1002,67 @@ See the :ref:`visualization documentation<visualization.box>` for more.
10021002
to ``df.boxplot(by="g")``. See :ref:`here<visualization.box.return>` for
10031003
an explanation.
10041004

1005+
1006+
.. _groupby.pipe:
1007+
1008+
Piping function calls
1009+
~~~~~~~~~~~~~~~~~~~~~
1010+
1011+
.. versionadded:: 0.17.0
1012+
1013+
Similar to the functionality provided by ``DataFrames`` and ``Series``, functions
1014+
that take ``GroupBy`` objects can be chained together using a ``pipe`` method to
1015+
allow for a cleaner, more readable syntax.
1016+
1017+
Imagine that one had functions f, g, and h that each takes a ``DataFrameGroupBy``
1018+
as well as a single argument and returns a ``DataFrameGroupBy``, and one wanted
1019+
to apply these functions in succession to a grouped DataFrame. Instead of having
1020+
to deeply compose these functions and their arguments, such as:
1021+
1022+
.. code-block:: python
1023+
1024+
>>> h(g(f(df.groupby('group'), arg1), arg2), arg3)
1025+
1026+
one can write the following:
1027+
1028+
.. code-block:: python
1029+
1030+
>>> (df
1031+
.groupby('group')
1032+
.pipe(f, arg1)
1033+
.pipe(g, arg2)
1034+
.pipe(h, arg3))
1035+
1036+
For a more concrete example, imagine one wanted to group a DataFrame by column
1037+
'A' and the user wanted to take the square of the difference between the maximum
1038+
value of 'B' in each group and the overall minimum value of 'C' (across all
1039+
groups). One could write this as a pipeline of functions applied to the original
1040+
dataframe:
1041+
1042+
.. code-block:: python
1043+
1044+
def f(dfgb):
1045+
"""
1046+
Take a DataFrameGroupBy and return a Series
1047+
where each value corresponds to the maximum
1048+
value of column 'B' in each group minus the
1049+
global minimum of column 'C'.
1050+
"""
1051+
return dfgb.B.max() - dfgb.C.min().min()
1052+
1053+
def square(srs):
1054+
"""
1055+
Take a Series and transform it by
1056+
squaring each value.
1057+
"""
1058+
return srs ** 2
1059+
1060+
res = df.groupby('A').pipe(f).pipe(square)
1061+
1062+
1063+
For more details on pipeline functionality, see :ref:`here<basics.pipe>`.
1064+
1065+
10051066
Examples
10061067
--------
10071068

doc/source/whatsnew/v0.17.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,9 @@ Other enhancements
468468
- ``pd.read_csv`` can now read bz2-compressed files incrementally, and the C parser can read bz2-compressed files from AWS S3 (:issue:`11070`, :issue:`11072`).
469469

470470

471+
- ``GroupBy`` objects now have a ``pipe`` method, similar to the one on ``DataFrame`` and ``Series``, that allows functions that take a ``GroupBy`` to be composed in a clean, readable syntax. See the :ref:`documentation <groupby.pipe>` for more (:issue:`10353`).
472+
473+
471474
.. _whatsnew_0170.api:
472475

473476
.. _whatsnew_0170.api_breaking:

pandas/core/generic.py

+4-10
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
AbstractMethodError)
2727
import pandas.core.nanops as nanops
2828
from pandas.util.decorators import Appender, Substitution, deprecate_kwarg
29+
from pandas.tools.util import _pipe
2930
from pandas.core import config
3031

3132

@@ -2169,7 +2170,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
21692170
-----
21702171
21712172
Use ``.pipe`` when chaining together functions that expect
2172-
on Series or DataFrames. Instead of writing
2173+
Series, DataFrames, or GroupBys. Instead of writing
21732174
21742175
>>> f(g(h(df), arg1=a), arg2=b, arg3=c)
21752176
@@ -2191,22 +2192,15 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
21912192
21922193
See Also
21932194
--------
2195+
pandas.GroupBy.pipe
21942196
pandas.DataFrame.apply
21952197
pandas.DataFrame.applymap
21962198
pandas.Series.map
21972199
"""
21982200
)
21992201
@Appender(_shared_docs['pipe'] % _shared_doc_kwargs)
22002202
def pipe(self, func, *args, **kwargs):
2201-
if isinstance(func, tuple):
2202-
func, target = func
2203-
if target in kwargs:
2204-
msg = '%s is both the pipe target and a keyword argument' % target
2205-
raise ValueError(msg)
2206-
kwargs[target] = self
2207-
return func(*args, **kwargs)
2208-
else:
2209-
return func(self, *args, **kwargs)
2203+
return _pipe(self, func, *args, **kwargs)
22102204

22112205
#----------------------------------------------------------------------
22122206
# Attribute access

pandas/core/groupby.py

+55-1
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@
1414
from pandas.core.base import PandasObject
1515
from pandas.core.categorical import Categorical
1616
from pandas.core.frame import DataFrame
17-
from pandas.core.generic import NDFrame
17+
from pandas.core.generic import NDFrame, _pipe
1818
from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index
1919
from pandas.core.internals import BlockManager, make_block
2020
from pandas.core.series import Series
2121
from pandas.core.panel import Panel
2222
from pandas.util.decorators import (cache_readonly, Appender, make_signature,
2323
deprecate_kwarg)
24+
from pandas.tools.util import _pipe
2425
import pandas.core.algorithms as algos
2526
import pandas.core.common as com
2627
from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
@@ -1076,6 +1077,59 @@ def tail(self, n=5):
10761077
tail = obj[in_tail]
10771078
return tail
10781079

1080+
def pipe(self, func, *args, **kwargs):
1081+
""" Apply a function with arguments to this GroupBy object
1082+
1083+
.. versionadded:: 0.17.0
1084+
1085+
Parameters
1086+
----------
1087+
func : callable or tuple of (callable, string)
1088+
Function to apply to this GroupBy or, alternatively, a
1089+
``(callable, data_keyword)`` tuple where ``data_keyword`` is a
1090+
string indicating the keyword of ``callable`` that expects the
1091+
GroupBy object.
1092+
args : iterable, optional
1093+
positional arguments passed into ``func``.
1094+
kwargs : any, dictionary
1095+
a dictionary of keyword arguments passed into ``func``.
1096+
1097+
Returns
1098+
-------
1099+
object : the return type of ``func``.
1100+
1101+
Notes
1102+
-----
1103+
1104+
Use ``.pipe`` when chaining together functions that expect
1105+
a GroupBy, or when alternating between functions that take
1106+
a DataFrame and a GroupBy.
1107+
1108+
Assuming that one has a function f that takes and returns
1109+
a DataFrameGroupBy, a function g that takes a DataFrameGroupBy
1110+
and returns a DataFrame, and a function h that takes a DataFrame,
1111+
instead of having to write:
1112+
1113+
>>> h(g(f(df.groupby('group'), arg1), arg2), arg3)
1114+
1115+
You can write
1116+
1117+
>>> (df
1118+
... .groupby('group')
1119+
... .pipe(f, arg1)
1120+
... .pipe(g, arg2)
1121+
... .pipe(h, arg3))
1122+
1123+
1124+
See Also
1125+
--------
1126+
pandas.Series.pipe
1127+
pandas.DataFrame.pipe
1128+
pandas.GroupBy.apply
1129+
"""
1130+
return _pipe(self, func, *args, **kwargs)
1131+
1132+
10791133
def _cumcount_array(self, arr=None, ascending=True):
10801134
"""
10811135
arr is where cumcount gets its values from

pandas/tests/test_groupby.py

+62-1
Original file line numberDiff line numberDiff line change
@@ -5159,7 +5159,7 @@ def test_tab_completion(self):
51595159
'resample', 'cummin', 'fillna', 'cumsum', 'cumcount',
51605160
'all', 'shift', 'skew', 'bfill', 'ffill',
51615161
'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
5162-
'cov', 'dtypes', 'diff', 'idxmax', 'idxmin'
5162+
'cov', 'dtypes', 'diff', 'idxmax', 'idxmin', 'pipe'
51635163
])
51645164
self.assertEqual(results, expected)
51655165

@@ -5467,6 +5467,7 @@ def test_func(x):
54675467
expected = DataFrame()
54685468
tm.assert_frame_equal(result, expected)
54695469

5470+
54705471
def test_first_last_max_min_on_time_data(self):
54715472
# GH 10295
54725473
# Verify that NaT is not in the result of max, min, first and last on
@@ -5512,6 +5513,66 @@ def test_sort(x):
55125513
g.apply(test_sort)
55135514

55145515

5516+
def test_pipe(self):
5517+
# Test the pipe method of DataFrameGroupBy.
5518+
# Issue #10353
5519+
5520+
random_state = np.random.RandomState(1234567890)
5521+
5522+
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
5523+
'foo', 'bar', 'foo', 'foo'],
5524+
'B': random_state.randn(8),
5525+
'C': random_state.randn(8)})
5526+
5527+
def f(dfgb):
5528+
return dfgb.B.max() - dfgb.C.min().min()
5529+
5530+
def square(srs):
5531+
return srs ** 2
5532+
5533+
# Note that the transformations are
5534+
# GroupBy -> Series
5535+
# Series -> Series
5536+
# This then chains the GroupBy.pipe and the
5537+
# NDFrame.pipe methods
5538+
res = df.groupby('A').pipe(f).pipe(square)
5539+
5540+
index = Index([u'bar', u'foo'], dtype='object', name=u'A')
5541+
expected = pd.Series([8.99110003361, 8.17516964785], name='B', index=index)
5542+
5543+
assert_series_equal(expected, res)
5544+
5545+
5546+
def test_pipe_args(self):
5547+
# Test passing args to the pipe method of DataFrameGroupBy.
5548+
# Issue #10353
5549+
5550+
df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
5551+
'x': [1.0, 2.0, 3.0, 2.0, 5.0],
5552+
'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})
5553+
5554+
def f(dfgb, arg1):
5555+
return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby(dfgb.grouper)
5556+
5557+
def g(dfgb, arg2):
5558+
return dfgb.sum() / dfgb.sum().sum() + arg2
5559+
5560+
def h(df, arg3):
5561+
return df.x + df.y - arg3
5562+
5563+
res = (df
5564+
.groupby('group')
5565+
.pipe(f, 0)
5566+
.pipe(g, 10)
5567+
.pipe(h, 100))
5568+
5569+
# Assert the results here
5570+
index = pd.Index(['A', 'B', 'C'], name='group')
5571+
expected = pd.Series([-79.5160891089, -78.4839108911, None], index=index)
5572+
5573+
assert_series_equal(expected, res)
5574+
5575+
55155576
def assert_fp_equal(a, b):
55165577
assert (np.abs(a - b) < 1e-12).all()
55175578

pandas/tools/util.py

+22
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,25 @@ def compose(*funcs):
4848
"""Compose 2 or more callables"""
4949
assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
5050
return reduce(_compose2, funcs)
51+
52+
53+
def _pipe(obj, func, *args, **kwargs):
54+
"""
55+
Apply a function to an obj either by
56+
passing the obj as the first argument
57+
to the function or, in the case that
58+
the func is a tuple, interpret the first
59+
element of the tuple as a function and
60+
pass the obj to that function as a keyword
61+
argument whose key is the value of the
62+
second element of the tuple
63+
"""
64+
if isinstance(func, tuple):
65+
func, target = func
66+
if target in kwargs:
67+
msg = '%s is both the pipe target and a keyword argument' % target
68+
raise ValueError(msg)
69+
kwargs[target] = obj
70+
return func(*args, **kwargs)
71+
else:
72+
return func(obj, *args, **kwargs)

0 commit comments

Comments
 (0)