From ef1ff13f1e2c3cb2b89561af24fa38b6b807d263 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Mon, 10 Oct 2016 17:34:51 +0100 Subject: [PATCH] DOC: Add details to DataFrame groupby transform Add requirements for user function in groupby transform closes #13543 [skip ci] --- doc/source/groupby.rst | 38 +++++++++++++++++++++++++++++++++----- pandas/core/groupby.py | 15 +++++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 8484ccd69a983..cbe3588104439 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -580,9 +580,21 @@ Transformation -------------- The ``transform`` method returns an object that is indexed the same (same size) -as the one being grouped. Thus, the passed transform function should return a -result that is the same size as the group chunk. For example, suppose we wished -to standardize the data within each group: +as the one being grouped. The transform function must: + +* Return a result that is either the same size as the group chunk or + broadcastable to the size of the group chunk (e.g., a scalar, + ``grouped.transform(lambda x: x.iloc[-1])``). +* Operate column-by-column on the group chunk. The transform is applied to + the first group chunk using chunk.apply. +* Not perform in-place operations on the group chunk. Group chunks should + be treated as immutable, and changes to a group chunk may produce unexpected + results. For example, when using ``fillna``, ``inplace`` must be ``False`` + (``grouped.transform(lambda x: x.fillna(inplace=False))``). +* (Optionally) operate on the entire group chunk. If this is supported, a + fast path is used starting from the *second* chunk. + +For example, suppose we wished to standardize the data within each group: .. ipython:: python @@ -620,6 +632,21 @@ We can also visually compare the original and transformed data sets. 
@savefig groupby_transform_plot.png compare.plot() +Transformation functions that have lower dimension outputs are broadcast to +match the shape of the input array. + +.. ipython:: python + + data_range = lambda x: x.max() - x.min() + ts.groupby(key).transform(data_range) + +Alternatively, the built-in methods can be used to produce the same +outputs + +.. ipython:: python + + ts.groupby(key).transform('max') - ts.groupby(key).transform('min') + Another common data transform is to replace missing data with the group mean. .. ipython:: python @@ -664,8 +691,9 @@ and that the transformed data contains no NAs. .. note:: - Some functions when applied to a groupby object will automatically transform the input, returning - an object of the same shape as the original. Passing ``as_index=False`` will not affect these transformation methods. + Some functions when applied to a groupby object will automatically transform + the input, returning an object of the same shape as the original. Passing + ``as_index=False`` will not affect these transformation methods. For example: ``fillna, ffill, bfill, shift``. diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ba2de295fa0a9..c52ddb8bf7016 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3674,10 +3674,25 @@ def transform(self, func, *args, **kwargs): Each subframe is endowed the attribute 'name' in case you need to know which group you are working on. + The current implementation imposes three requirements on f: + + * f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe. + For example, if f returns a scalar it will be broadcast to have the + same shape as the input subframe. + * f must support application column-by-column in the subframe. If f + also supports application to the entire subframe, then a fast path + is used starting from the second chunk. + * f must not mutate subframes. 
Mutation is not supported and may + produce unexpected results. + Examples -------- >>> grouped = df.groupby(lambda x: mapping[x]) + # Same shape >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + # Broadcastable + >>> grouped.transform(lambda x: x.max() - x.min()) """ # optimized transforms