From d73f3327e5a2c6d6a038782c31ab527a1484e368 Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Sat, 23 Jan 2016 07:51:10 -0600 Subject: [PATCH 1/2] ENH BinGrouper use BaseGrouper's apply The `BinGrouper.apply` and `BaseGrouper.apply` have different output types. To make them consistent, remove `BinGrouper.apply` and let it use the same method as the superclass `BaseGrouper`. This requires changing `BinGrouper.groupings` to return a list of `Grouping` objects (there will always only be one) instead of `None`. --- doc/source/whatsnew/v0.18.1.txt | 66 +++++++++++++++++++++++++++++++++ pandas/core/groupby.py | 23 +----------- pandas/tests/test_groupby.py | 34 +++++++++++++++++ 3 files changed, 102 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 1179a347e4c46..c3a0967e8193c 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -72,15 +72,81 @@ API changes - ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`) + - ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`) +- Using ``apply`` on resampling groupby operations (e.g. ``df.groupby(pd.TimeGrouper(freq='M', key='date')).apply(...)``) now has the same output types as similar ``apply``s on other groupby operations (e.g. ``df.groupby(pd.Grouper(key='color')).apply(...)``). (:issue:`11742`). + +Previous behavior: + +.. code-block:: python + + In [1]: df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), 'value': [10, 13]}) + + In [2]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum()) + Out[2]: + ... + TypeError: cannot concatenate a non-NDFrame object + + In [3]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum()) + Out[3]: + date + 2000-10-31 value 10 + 2000-11-30 value 13 + dtype: int64 + + In [3]: type(df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum())) + Out[3]: pandas.core.series.Series + + + In [4]: df.groupby(pd.Grouper(key='date')).apply(lambda x: x.value.sum()) + Out[4]: + date + 2000-10-10 10 + 2000-11-10 13 + dtype: int64 + + In [5]: type(df.groupby(pd.Grouper(key='date')).apply(lambda x: x.value.sum())) + Out[5]: pandas.core.series.Series + + + In [6]: df.groupby(pd.Grouper(key='date')).apply(lambda x: x[['value']].sum()) + Out[6]: + value + date + 2000-10-10 10 + 2000-11-10 13 + + In [7]: type(df.groupby(pd.Grouper(key='date')).apply(lambda x: x[['value']].sum())) + Out[7]: pandas.core.frame.DataFrame + + +New Behavior: +.. code-block:: python + In [1]: df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), 'value': [10, 13]}) + In [2]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum()) + Out[2]: + date + 2000-10-31 10 + 2000-11-30 13 + Freq: M, dtype: int64 + In [3]: type(df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum())) + Out[3]: pandas.core.series.Series + In [4]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum()) + Out[4]: + value + date + 2000-10-31 10 + 2000-11-30 13 + In [5]: type(df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum())) + Out[5]: pandas.core.frame.DataFrame .. _whatsnew_0181.deprecations: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 398e37d52d7ba..066afc55e442f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2004,25 +2004,6 @@ def get_iterator(self, data, axis=0): if start < length: yield self.binlabels[-1], slicer(start, None) - def apply(self, f, data, axis=0): - result_keys = [] - result_values = [] - mutated = False - for key, group in self.get_iterator(data, axis=axis): - object.__setattr__(group, 'name', key) - - # group might be modified - group_axes = _get_axes(group) - res = f(group) - - if not _is_indexed_like(res, group_axes): - mutated = True - - result_keys.append(key) - result_values.append(res) - - return result_keys, result_values, mutated - @cache_readonly def indices(self): indices = collections.defaultdict(list) @@ -2071,8 +2052,8 @@ def names(self): @property def groupings(self): - # for compat - return None + return [Grouping(lvl, lvl, in_axis=False, level=None, name=name) + for lvl, name in zip(self.levels, self.names)] def agg_series(self, obj, func): dummy = obj[:0] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index ff9fd7dfb5980..dfd4862c6a8a5 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4824,6 +4824,40 @@ def test_timegrouper_get_group(self): result = grouped.get_group(dt) assert_frame_equal(result, expected) + def test_timegrouper_apply_return_type_series(self): + # Using `apply` with the `TimeGrouper` should give the + # same return type as an `apply` with a `Grouper`. + df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], + 'value': [10, 13]}) + df_dt = df.copy() + df_dt['date'] = pd.to_datetime(df_dt['date']) + + def sumfunc_series(x): + return pd.Series([x['value'].sum()], ('sum',)) + + expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series) + result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) + .apply(sumfunc_series)) + assert_frame_equal(result.reset_index(drop=True), + expected.reset_index(drop=True)) + + def test_timegrouper_apply_return_type_value(self): + # Using `apply` with the `TimeGrouper` should give the + # same return type as an `apply` with a `Grouper`. + df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], + 'value': [10, 13]}) + df_dt = df.copy() + df_dt['date'] = pd.to_datetime(df_dt['date']) + + def sumfunc_value(x): + return x.value.sum() + + expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) + result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) + .apply(sumfunc_value)) + assert_series_equal(result.reset_index(drop=True), + expected.reset_index(drop=True)) + def test_cumcount(self): df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) g = df.groupby('A') From 8cf618faecf3c61edb954cbfbda5c942ff1f189b Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Thu, 31 Mar 2016 11:40:08 -0700 Subject: [PATCH 2/2] DOC Modify docs for CR comments --- doc/source/whatsnew/v0.18.1.txt | 76 ++++++++------------------------- pandas/tests/test_groupby.py | 2 + 2 files changed, 19 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index c3a0967e8193c..48ec3f8240bcd 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -77,77 +77,35 @@ API changes - Using ``apply`` on resampling groupby operations (e.g. ``df.groupby(pd.TimeGrouper(freq='M', key='date')).apply(...)``) now has the same output types as similar ``apply``s on other groupby operations (e.g. ``df.groupby(pd.Grouper(key='color')).apply(...)``). (:issue:`11742`). + +New Behavior: + +.. ipython:: python + + df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), 'value': [10, 13]}) + df + # Output is a Series + df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum()) + # Output is a DataFrame + df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum()) + Previous behavior: .. code-block:: python - In [1]: df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), 'value': [10, 13]}) - - In [2]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum()) - Out[2]: + In [1]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum()) + Out[1]: ... TypeError: cannot concatenate a non-NDFrame object - In [3]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum()) - Out[3]: + # Output is a Series + In [2]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum()) + Out[2]: date 2000-10-31 value 10 2000-11-30 value 13 dtype: int64 - In [3]: type(df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum())) - Out[3]: pandas.core.series.Series - - - In [4]: df.groupby(pd.Grouper(key='date')).apply(lambda x: x.value.sum()) - Out[4]: - date - 2000-10-10 10 - 2000-11-10 13 - dtype: int64 - - In [5]: type(df.groupby(pd.Grouper(key='date')).apply(lambda x: x.value.sum())) - Out[5]: pandas.core.series.Series - - - In [6]: df.groupby(pd.Grouper(key='date')).apply(lambda x: x[['value']].sum()) - Out[6]: - value - date - 2000-10-10 10 - 2000-11-10 13 - - In [7]: type(df.groupby(pd.Grouper(key='date')).apply(lambda x: x[['value']].sum())) - Out[7]: pandas.core.frame.DataFrame - - -New Behavior: - -.. code-block:: python - - In [1]: df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), 'value': [10, 13]}) - - In [2]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum()) - Out[2]: - date - 2000-10-31 10 - 2000-11-30 13 - Freq: M, dtype: int64 - - In [3]: type(df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x.value.sum())) - Out[3]: pandas.core.series.Series - - - In [4]: df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum()) - Out[4]: - value - date - 2000-10-31 10 - 2000-11-30 13 - - In [5]: type(df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum())) - Out[5]: pandas.core.frame.DataFrame - .. _whatsnew_0181.deprecations: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index dfd4862c6a8a5..28038e02b64ca 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4827,6 +4827,7 @@ def test_timegrouper_get_group(self): def test_timegrouper_apply_return_type_series(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. + # Issue #11742 df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], 'value': [10, 13]}) df_dt = df.copy() @@ -4844,6 +4845,7 @@ def sumfunc_series(x): def test_timegrouper_apply_return_type_value(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. + # Issue #11742 df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], 'value': [10, 13]}) df_dt = df.copy()