From a316f2ff20214bda2cbd24f22c1fc04f14f27baa Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 1 Mar 2014 14:03:43 -0500 Subject: [PATCH 1/4] BUG/API: allow TimeGrouper with other columns in a groupby (GH3794) --- pandas/core/groupby.py | 30 +++++++- pandas/core/series.py | 2 +- pandas/tests/test_groupby.py | 102 ++++++++++++++++++++++++++ pandas/tseries/resample.py | 37 +++++++++- pandas/tseries/tests/test_resample.py | 1 - 5 files changed, 166 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 86590d2319447..cb82999211ec7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1479,6 +1479,8 @@ class Grouping(object): ---------- index : Index grouper : + obj : + axis : name : level : @@ -1493,7 +1495,7 @@ class Grouping(object): * groups : dict of {group -> label_list} """ - def __init__(self, index, grouper=None, name=None, level=None, + def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None, sort=True): self.name = name @@ -1566,6 +1568,30 @@ def __init__(self, index, grouper=None, name=None, level=None, if self.name is None: self.name = factor.name + # a passed TimeGrouper like + elif isinstance(self.grouper, CustomGrouper): + + # get the obj to work on + if self.grouper.name is not None: + name = self.grouper.name + if name not in obj._info_axis: + raise KeyError("The grouper name {0} is not found".format(name)) + ax = Index(obj[name],name=name) + else: + ax = obj._get_axis(axis) + if self.grouper.level is not None: + level = self.grouper.level + if isinstance(ax, MultiIndex): + level = ax._get_level_name(level) + ax = Index(ax.get_level_values(level), name=level) + else: + if not (level == 0 or level == ax.name): + raise ValueError("The grouper level {0} is not valid".format(level)) + + self.grouper = self.grouper._get_grouper_for_ax(ax) + if self.name is None: + self.name = self.grouper.name + # no level passed if not isinstance(self.grouper, (Series, np.ndarray)): 
self.grouper = self.index.map(self.grouper) @@ -1704,7 +1730,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): errmsg = "Categorical grouper must have len(grouper) == len(data)" raise AssertionError(errmsg) - ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort) + ping = Grouping(group_axis, gpr, obj=obj, axis=axis, name=name, level=level, sort=sort) groupings.append(ping) if len(groupings) == 0: diff --git a/pandas/core/series.py b/pandas/core/series.py index bc5566ce4baa1..dd11b7bec9216 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2031,7 +2031,7 @@ def reindex_axis(self, labels, axis=0, **kwargs): raise ValueError("cannot reindex series on non-zero axis!") return self.reindex(index=labels, **kwargs) - def take(self, indices, axis=0, convert=True): + def take(self, indices, axis=0, convert=True, is_copy=False): """ Analogous to ndarray.take, return Series corresponding to requested indices diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index adca8389b8939..0d3270d976222 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2903,6 +2903,108 @@ def test_groupby_with_timegrouper_methods(self): self.assertTrue(isinstance(groups,dict)) self.assertTrue(len(groups) == 3) + def test_timegrouper_with_reg_groups(self): + + # GH 3794 + # allow combinateion of timegrouper/reg groups + + import datetime as DT + + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [ + DT.datetime(2013,1,1,13,0), + DT.datetime(2013,1,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,12,2,14,0), + ]}).set_index('Date') + + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,12,31,0,0), + 
DT.datetime(2013,12,31,0,0), + DT.datetime(2013,12,31,0,0), + ]}).set_index(['Date','Buyer']) + result = df.groupby([pd.TimeGrouper('A'),'Buyer']).sum() + assert_frame_equal(result,expected) + + expected = DataFrame({ + 'Buyer': 'Carl Mark Carl Joe'.split(), + 'Quantity': [1,3,9,18], + 'Date' : [ + DT.datetime(2013,1,1,0,0), + DT.datetime(2013,1,1,0,0), + DT.datetime(2013,7,1,0,0), + DT.datetime(2013,7,1,0,0), + ]}).set_index(['Date','Buyer']) + + result = df.groupby([pd.TimeGrouper('6MS'),'Buyer']).sum() + assert_frame_equal(result,expected) + + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [ + DT.datetime(2013,10,1,13,0), + DT.datetime(2013,10,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,2,12,0), + DT.datetime(2013,10,2,14,0), + ]}).set_index('Date') + + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark Carl Joe'.split(), + 'Quantity': [6,8,3,4,10], + 'Date' : [ + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,2,0,0), + DT.datetime(2013,10,2,0,0), + ]}).set_index(['Date','Buyer']) + + result = df.groupby([pd.TimeGrouper('1D'),'Buyer']).sum() + assert_frame_equal(result,expected) + + result = df.groupby([pd.TimeGrouper('1M'),'Buyer']).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,10,31,0,0), + DT.datetime(2013,10,31,0,0), + DT.datetime(2013,10,31,0,0), + ]}).set_index(['Date','Buyer']) + assert_frame_equal(result,expected) + + # passing the name + df = df.reset_index() + result = df.groupby([pd.TimeGrouper('1M',name='Date'),'Buyer']).sum() + assert_frame_equal(result,expected) + + self.assertRaises(KeyError, lambda : df.groupby([pd.TimeGrouper('1M',name='foo'),'Buyer']).sum()) + + # passing the level + df = 
df.set_index('Date') + result = df.groupby([pd.TimeGrouper('1M',level='Date'),'Buyer']).sum() + assert_frame_equal(result,expected) + result = df.groupby([pd.TimeGrouper('1M',level=0),'Buyer']).sum() + assert_frame_equal(result,expected) + + self.assertRaises(ValueError, lambda : df.groupby([pd.TimeGrouper('1M',level='foo'),'Buyer']).sum()) + def test_cumcount(self): df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) g = df.groupby('A') diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index e4221fdea083c..89d1daf2a8e0d 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -30,6 +30,8 @@ class TimeGrouper(CustomGrouper): nperiods : optional, integer convention : {'start', 'end', 'e', 's'} If axis is PeriodIndex + name : referring name, default None + level : referering level, default None Notes ----- @@ -39,7 +41,7 @@ class TimeGrouper(CustomGrouper): def __init__(self, freq='Min', closed=None, label=None, how='mean', nperiods=None, axis=0, fill_method=None, limit=None, loffset=None, kind=None, - convention=None, base=0): + convention=None, base=0, name=None, level=None): self.freq = to_offset(freq) end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W']) @@ -70,6 +72,8 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.fill_method = fill_method self.limit = limit self.base = base + self.name = name + self.level = level def resample(self, obj): ax = obj._get_axis(self.axis) @@ -103,13 +107,42 @@ def get_grouper(self, obj): # return a tuple of (binner, grouper, obj) return self._get_time_grouper(obj) + def _get_grouper_for_ax(self, ax): + # return an ordering of the transformed group labels, + # suitable for multi-grouping, e.g the labels for + # the resampled intervals + + indexer = None + if not ax.is_monotonic: + indexer = ax.argsort(kind='quicksort') + ax = ax.take(indexer) + + if self.kind is None or self.kind == 'timestamp': + binner, bins, binlabels = self._get_time_bins(ax) + 
else: + binner, bins, binlabels = self._get_time_period_bins(ax) + + grp = BinGrouper(bins, binlabels) + + # create the grouper + l = [] + for key, group in grp.get_iterator(ax): + l.extend([key]*len(group)) + grouper = binner.__class__(l,freq=binner.freq,name=binner.name) + + # since we may have had to sort + # may need to reorder groups here + if indexer is not None: + grouper = grouper.take(indexer) + return grouper + def _ensure_sortedness(self, obj): # ensure that our object is sorted ax = obj._get_axis(self.axis) if not ax.is_monotonic: try: obj = obj.sort_index(axis=self.axis) - except TypeError: + except: obj = obj.sort_index() return obj diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 23b8905b2ae9a..20c6724726955 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1137,7 +1137,6 @@ def test_apply_iteration(self): _, grouper, _ = tg.get_grouper(df) # Errors - grouped = df.groupby(grouper, group_keys=False) f = lambda df: df['close'] / df['open'] From a7b19f9b4b59ffbcf0a1a8b80d124308d91cfa3c Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Mar 2014 10:16:15 -0400 Subject: [PATCH 2/4] CLN/API: replace groupby.CustomGrouper with Grouper rename internally Grouper to BaseGrouper to avoid conflict TimeGrouper to now inherit from Grouper --- pandas/core/api.py | 1 + pandas/core/groupby.py | 165 ++++++++++++++++++++++++++--------- pandas/tests/test_groupby.py | 27 ++++-- pandas/tseries/resample.py | 17 ++-- 4 files changed, 154 insertions(+), 56 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index 4d8d4dcda7589..3ebcb46cd98fa 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -6,6 +6,7 @@ from pandas.core.algorithms import factorize, match, unique, value_counts from pandas.core.common import isnull, notnull from pandas.core.categorical import Categorical +from pandas.core.groupby import Grouper from pandas.core.format import 
set_eng_float_format from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index cb82999211ec7..760dca377c6a5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -139,6 +139,111 @@ def _last(x): else: return _last(x) +class Grouper(object): + """ + A Grouper allows the user to specify a groupby instruction + + Parameters + ---------- + key : groupby key, default None + level : name, int level number, default None + freq : string / frequency object, default None + sort : boolean, whether to sort the resulting labels, default True + + Returns + ------- + A specification for a groupby instruction + + Examples + -------- + df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A') + df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date' + df.groupby(Grouper(level='date',freq='60s',axis=1)) : + specify a resample on the level 'date' on the columns axis with a frequency of 60s + + """ + + def __new__(cls, *args, **kwargs): + if kwargs.get('freq') is not None: + from pandas.tseries.resample import TimeGrouper + cls = TimeGrouper + return super(Grouper, cls).__new__(cls) + + def __init__(self, key=None, level=None, freq=None, axis=None, sort=True): + self.key = key + self.level = level + self.freq = freq + self.axis = axis + self.sort = sort + self.grouper = None + + def get_grouper(self, obj): + + """ + Parameters + ---------- + obj : the subject object + + Returns + ------- + a tuple of binner, grouper, obj (possibly sorted) + """ + + # default is to not use a binner + return None, self.get_grouper_for_ax(obj), obj + + def get_grouper_for_ax(self, obj): + """ + given an object and the specifcations, return a grouper for this particular specification + + Parameters + ---------- + obj : the subject object + + Returns + ------- + grouper : an index mapping, or a BinGrouper like object + """ + + if self.key is not None and self.level is not None: + 
raise ValueError("The Grouper cannot specify both a key and a level!") + + # the key must be a valid info item + if self.key is not None: + key = self.key + if key not in obj._info_axis: + raise KeyError("The grouper name {0} is not found".format(key)) + ax = Index(obj[key],name=key) + + else: + ax = obj._get_axis(self.axis) + if self.level is not None: + level = self.level + + # if a level is given it must be a mi level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + + if isinstance(level, compat.string_types): + if obj.index.name != level: + raise ValueError('level name %s is not the name of the ' + 'index' % level) + elif level > 0: + raise ValueError('level > 0 only valid with MultiIndex') + ax = Index(ax.get_level_values(level), name=level) + + else: + if not (level == 0 or level == ax.name): + raise ValueError("The grouper level {0} is not valid".format(level)) + + return self._get_grouper_for_ax(ax) + + def _get_grouper_for_ax(self, ax): + return ax + + @property + def groups(self): + return self.grouper.groups class GroupBy(PandasObject): @@ -882,10 +987,9 @@ def _is_indexed_like(obj, axes): return False -class Grouper(object): - +class BaseGrouper(object): """ - + This is an internal Grouper class, which actually holds the generated groups """ def __init__(self, axis, groupings, sort=True, group_keys=True): @@ -1328,19 +1432,7 @@ def generate_bins_generic(values, binner, closed): return bins - -class CustomGrouper(object): - - def get_grouper(self, obj): - raise NotImplementedError - - # delegates - @property - def groups(self): - return self.grouper.groups - - -class BinGrouper(Grouper): +class BinGrouper(BaseGrouper): def __init__(self, bins, binlabels, filter_empty=False): self.bins = com._ensure_int64(bins) @@ -1495,7 +1587,7 @@ class Grouping(object): * groups : dict of {group -> label_list} """ - def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None, + def __init__(self, index, grouper=None, obj=None, 
name=None, level=None, sort=True): self.name = name @@ -1515,6 +1607,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None, self._was_factor = False self._should_compress = True + # we have a single grouper which may be a myriad of things, some of which are + # dependent on the passing in level + # + if level is not None: if not isinstance(level, int): if level not in index.names: @@ -1556,7 +1652,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None, else: if isinstance(self.grouper, (list, tuple)): self.grouper = com._asarray_tuplesafe(self.grouper) + + # a passed Categorical elif isinstance(self.grouper, Categorical): + factor = self.grouper self._was_factor = True @@ -1568,27 +1667,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None, if self.name is None: self.name = factor.name - # a passed TimeGrouper like - elif isinstance(self.grouper, CustomGrouper): - - # get the obj to work on - if self.grouper.name is not None: - name = self.grouper.name - if name not in obj._info_axis: - raise KeyError("The grouper name {0} is not found".format(name)) - ax = Index(obj[name],name=name) - else: - ax = obj._get_axis(axis) - if self.grouper.level is not None: - level = self.grouper.level - if isinstance(ax, MultiIndex): - level = ax._get_level_name(level) - ax = Index(ax.get_level_values(level), name=level) - else: - if not (level == 0 or level == ax.name): - raise ValueError("The grouper level {0} is not valid".format(level)) + # a passed Grouper like + elif isinstance(self.grouper, Grouper): - self.grouper = self.grouper._get_grouper_for_ax(ax) + self.grouper = self.grouper.get_grouper_for_ax(obj) if self.name is None: self.name = self.grouper.name @@ -1674,10 +1756,10 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): level = None key = group_axis - if isinstance(key, CustomGrouper): + if isinstance(key, Grouper): binner, gpr, obj = key.get_grouper(obj) 
return gpr, [], obj - elif isinstance(key, Grouper): + elif isinstance(key, BaseGrouper): return key, [], obj if not isinstance(key, (tuple, list)): @@ -1730,13 +1812,14 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): errmsg = "Categorical grouper must have len(grouper) == len(data)" raise AssertionError(errmsg) - ping = Grouping(group_axis, gpr, obj=obj, axis=axis, name=name, level=level, sort=sort) + ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort) groupings.append(ping) if len(groupings) == 0: raise ValueError('No group keys passed!') - grouper = Grouper(group_axis, groupings, sort=sort) + # create the internals grouper + grouper = BaseGrouper(group_axis, groupings, sort=sort) return grouper, exclusions, obj diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 0d3270d976222..2506a74e210d5 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2991,19 +2991,36 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([pd.TimeGrouper('1M',name='Date'),'Buyer']).sum() + result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() assert_frame_equal(result,expected) - self.assertRaises(KeyError, lambda : df.groupby([pd.TimeGrouper('1M',name='foo'),'Buyer']).sum()) + self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum()) # passing the level df = df.set_index('Date') - result = df.groupby([pd.TimeGrouper('1M',level='Date'),'Buyer']).sum() + result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum() assert_frame_equal(result,expected) - result = df.groupby([pd.TimeGrouper('1M',level=0),'Buyer']).sum() + result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum() assert_frame_equal(result,expected) - self.assertRaises(ValueError, lambda : df.groupby([pd.TimeGrouper('1M',level='foo'),'Buyer']).sum()) + self.assertRaises(ValueError, lambda : 
df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum()) + + # multi names + df = df.copy() + df['Date'] = df.index + pd.offsets.MonthEnd(2) + result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,11,30,0,0), + DT.datetime(2013,11,30,0,0), + DT.datetime(2013,11,30,0,0), + ]}).set_index(['Date','Buyer']) + assert_frame_equal(result,expected) + + # error as we have both a level and a name! + self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum()) def test_cumcount(self): df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 89d1daf2a8e0d..381d8937466b3 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -2,7 +2,7 @@ import numpy as np -from pandas.core.groupby import BinGrouper, CustomGrouper +from pandas.core.groupby import BinGrouper, Grouper from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod from pandas.tseries.index import DatetimeIndex, date_range from pandas.tseries.offsets import DateOffset, Tick, _delta_to_nanoseconds @@ -18,7 +18,7 @@ _DEFAULT_METHOD = 'mean' -class TimeGrouper(CustomGrouper): +class TimeGrouper(Grouper): """ Custom groupby class for time-interval grouping @@ -30,8 +30,6 @@ class TimeGrouper(CustomGrouper): nperiods : optional, integer convention : {'start', 'end', 'e', 's'} If axis is PeriodIndex - name : referring name, default None - level : referering level, default None Notes ----- @@ -41,11 +39,11 @@ class TimeGrouper(CustomGrouper): def __init__(self, freq='Min', closed=None, label=None, how='mean', nperiods=None, axis=0, fill_method=None, limit=None, loffset=None, kind=None, - convention=None, base=0, name=None, level=None): - self.freq = to_offset(freq) + convention=None, base=0, **kwargs): + freq = 
to_offset(freq) end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W']) - rule = self.freq.rule_code + rule = freq.rule_code if (rule in end_types or ('-' in rule and rule[:rule.find('-')] in end_types)): if closed is None: @@ -66,14 +64,13 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.convention = convention or 'E' self.convention = self.convention.lower() - self.axis = axis self.loffset = loffset self.how = how self.fill_method = fill_method self.limit = limit self.base = base - self.name = name - self.level = level + + super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs) def resample(self, obj): ax = obj._get_axis(self.axis) From 2f667db4523f5e0c92236cbf59a48fcfefeccb53 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Mar 2014 08:12:14 -0400 Subject: [PATCH 3/4] DOC: update groupby docs for using pd.Grouper --- doc/source/groupby.rst | 50 ++++++++++++++++++++++++++++++++++-- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 2 ++ pandas/tests/test_groupby.py | 8 +++--- 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index b5c15f83bb9d3..6d97b2270a5a4 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -707,6 +707,52 @@ can be used as group keys. If so, the order of the levels will be preserved: data.groupby(factor).mean() +.. _groupby.specify: + +Grouping with a Grouper specification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may need to specify a bit more data to properly group. You can +use the ``pd.Grouper`` to provide this local control. + +.. 
ipython:: python + + import datetime as DT + + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [ + DT.datetime(2013,1,1,13,0), + DT.datetime(2013,1,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,12,2,14,0), + ]}) + + df + +Groupby a specific column with the desired frequency. This is like resampling. + +.. ipython:: python + + df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + +You have an ambiguous specification in that you have a named index and a column +that could be potential groupers. + +.. ipython:: python + + df = df.set_index('Date') + df['Date'] = df.index + pd.offsets.MonthEnd(2) + df.groupby([pd.Grouper(freq='6M',key='Date'),'Buyer']).sum() + + df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum() + Taking the first rows of each group ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -753,7 +799,7 @@ To select from a DataFrame or Series the nth item, use the nth method: g.nth(-1) -If you want to select the nth not-null method, use the dropna kwarg. For a DataFrame this should be either 'any' or 'all' just like you would pass to dropna, for a Series this just needs to be truthy. +If you want to select the nth not-null method, use the dropna kwarg. For a DataFrame this should be either 'any' or 'all' just like you would pass to dropna, for a Series this just needs to be truthy. .. 
ipython:: python @@ -808,7 +854,7 @@ column index name will be used as the name of the inserted column: 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], - }) + }) def compute_metrics(x): result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} diff --git a/doc/source/release.rst b/doc/source/release.rst index 4890f22e98468..f5108effba48a 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -125,6 +125,8 @@ API Changes ``DataFrame.stack`` operations where the name of the column index is used as the name of the inserted column containing the pivoted data. +- Allow specification of a more complex groupby, via ``pd.Grouper`` (:issue:`3794`) + Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index cfee48d62928b..54fb96595769b 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -90,6 +90,8 @@ These are out-of-bounds selections g.nth(0, dropna='any') # similar to old behaviour +- Allow specification of a more complex groupby via ``pd.Groupby``, See :ref:`the docs `. (:issue:`3794`) + - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` (:issue:`5987`). 
For the :class:`~pandas.DataFrame` methods, two things have diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 2506a74e210d5..cf2447c0e6dfa 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2933,7 +2933,7 @@ def test_timegrouper_with_reg_groups(self): DT.datetime(2013,12,31,0,0), DT.datetime(2013,12,31,0,0), ]}).set_index(['Date','Buyer']) - result = df.groupby([pd.TimeGrouper('A'),'Buyer']).sum() + result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum() assert_frame_equal(result,expected) expected = DataFrame({ @@ -2946,7 +2946,7 @@ def test_timegrouper_with_reg_groups(self): DT.datetime(2013,7,1,0,0), ]}).set_index(['Date','Buyer']) - result = df.groupby([pd.TimeGrouper('6MS'),'Buyer']).sum() + result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum() assert_frame_equal(result,expected) df = DataFrame({ @@ -2975,10 +2975,10 @@ def test_timegrouper_with_reg_groups(self): DT.datetime(2013,10,2,0,0), ]}).set_index(['Date','Buyer']) - result = df.groupby([pd.TimeGrouper('1D'),'Buyer']).sum() + result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum() assert_frame_equal(result,expected) - result = df.groupby([pd.TimeGrouper('1M'),'Buyer']).sum() + result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum() expected = DataFrame({ 'Buyer': 'Carl Joe Mark'.split(), 'Quantity': [10,18,3], From 5e965e956b2c420bfbd1b424967b46cd29e00459 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Mar 2014 18:12:32 -0400 Subject: [PATCH 4/4] CLN: refactor of groupby/resample to handle Grouper in a more elegant / cleaner way by keeping internal groupby state inside the Grouper rather than passing around lots of results DOC: minor doc edits for groupby.rst / v0.14.0 PEP8: minor pep changes --- doc/source/groupby.rst | 8 +++ doc/source/v0.14.0.txt | 7 ++- pandas/core/groupby.py | 80 ++++++++++++++++++++++-------- pandas/tests/test_groupby.py | 19 ++++++++ pandas/tseries/resample.py | 94 +++++++++++++++++------------------- 5 
files changed, 134 insertions(+), 74 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 6d97b2270a5a4..cc5ebc730f94a 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -754,6 +754,8 @@ that could be potential groupers. df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum() +.. _groupby.nth: + Taking the first rows of each group ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -833,6 +835,9 @@ To see the order in which each row appears within its group, use the Examples -------- +Regrouping by factor +~~~~~~~~~~~~~~~~~~~~ + Regroup columns of a DataFrame according to their sum, and sum the aggregated ones. .. ipython:: python @@ -842,6 +847,9 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on df.groupby(df.sum(), axis=1).sum() +Returning a Series to propagate names +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Group DataFrame columns, compute a set of metrics and return a named Series. The Series name is used as the name for the column index. This is especially useful in conjunction with reshaping operations such as stacking in which the diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 54fb96595769b..0613d56604844 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -11,6 +11,7 @@ Highlights include: - MultIndexing Using Slicers - Joining a singly-indexed DataFrame with a multi-indexed DataFrame +- More flexible groupby specifications API changes ~~~~~~~~~~~ @@ -80,7 +81,7 @@ These are out-of-bounds selections g[['B']].head(1) - groupby ``nth`` now filters by default, with optional dropna argument to ignore - NaN (to replicate the previous behaviour.) + NaN (to replicate the previous behaviour). See :ref:`the docs <groupby.nth>`. .. ipython:: python @@ -90,7 +91,8 @@ These are out-of-bounds selections g.nth(0, dropna='any') # similar to old behaviour -- Allow specification of a more complex groupby via ``pd.Groupby``, See :ref:`the docs `. 
(:issue:`3794`) +- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping + by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`) - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` @@ -123,6 +125,7 @@ These are out-of-bounds selections .. ipython:: python i[[0,1,2]].astype(np.int_) + - ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example, the old behavior returned an Index in this case (:issue:`6459`): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 760dca377c6a5..7bf20d71cb301 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -170,12 +170,21 @@ def __new__(cls, *args, **kwargs): return super(Grouper, cls).__new__(cls) def __init__(self, key=None, level=None, freq=None, axis=None, sort=True): - self.key = key - self.level = level - self.freq = freq - self.axis = axis - self.sort = sort - self.grouper = None + self.key=key + self.level=level + self.freq=freq + self.axis=axis + self.sort=sort + + self.grouper=None + self.obj=None + self.indexer=None + self.binner=None + self.grouper=None + + @property + def ax(self): + return self.grouper def get_grouper(self, obj): @@ -189,20 +198,17 @@ def get_grouper(self, obj): a tuple of binner, grouper, obj (possibly sorted) """ - # default is to not use a binner - return None, self.get_grouper_for_ax(obj), obj + self.set_grouper(obj) + return self.binner, self.grouper, self.obj - def get_grouper_for_ax(self, obj): + def set_grouper(self, obj): """ - given an object and the specifcations, return a grouper for this particular specification + given an object and the specifications, set up the internal grouper for this particular specification Parameters ---------- obj : the subject object - Returns - ------- - grouper : an index mapping, or a BinGrouper like object """ if self.key is not None and self.level is not None: @@ -236,10 +242,18 @@ def 
get_grouper_for_ax(self, obj): if not (level == 0 or level == ax.name): raise ValueError("The grouper level {0} is not valid".format(level)) - return self._get_grouper_for_ax(ax) + # possibly sort + if not ax.is_monotonic: + indexer = self.indexer = ax.argsort(kind='quicksort') + ax = ax.take(indexer) + obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False) + + self.obj = obj + self.grouper = ax + return self.grouper - def _get_grouper_for_ax(self, ax): - return ax + def get_binner_for_grouping(self, obj): + raise NotImplementedError @property def groups(self): @@ -1572,7 +1586,6 @@ class Grouping(object): index : Index grouper : obj : - axis : name : level : @@ -1670,9 +1683,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Grouper like elif isinstance(self.grouper, Grouper): - self.grouper = self.grouper.get_grouper_for_ax(obj) + # get the new grouper + grouper = self.grouper.get_binner_for_grouping(obj) + self.grouper = grouper if self.name is None: - self.name = self.grouper.name + self.name = grouper.name # no level passed if not isinstance(self.grouper, (Series, np.ndarray)): @@ -1742,8 +1757,28 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True): + """ + create and return a BaseGrouper, which is an internal + mapping of how to create the grouper indexers. + This may be composed of multiple Grouping objects, indicating + multiple groupers + + Groupers are ultimately index mappings. They can originate as: + index mappings, keys to columns, functions, or Groupers + + Groupers enable local references to axis,level,sort, while + the passed in axis, level, and sort are 'global'. + + This routine tries to figure of what the passing in references + are and then creates a Grouping for each one, combined into + a BaseGrouper. 
+ + """ + group_axis = obj._get_axis(axis) + # validate that the passed level is compatible with the passed + # axis of the object if level is not None: if not isinstance(group_axis, MultiIndex): if isinstance(level, compat.string_types): @@ -1756,9 +1791,12 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): level = None key = group_axis + # a passed in Grouper, directly convert if isinstance(key, Grouper): - binner, gpr, obj = key.get_grouper(obj) - return gpr, [], obj + binner, grouper, obj = key.get_grouper(obj) + return grouper, [], obj + + # already have a BaseGrouper, just return it elif isinstance(key, BaseGrouper): return key, [], obj diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index cf2447c0e6dfa..4d47750660800 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2933,6 +2933,7 @@ def test_timegrouper_with_reg_groups(self): DT.datetime(2013,12,31,0,0), DT.datetime(2013,12,31,0,0), ]}).set_index(['Date','Buyer']) + result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum() assert_frame_equal(result,expected) @@ -3022,6 +3023,24 @@ def test_timegrouper_with_reg_groups(self): # error as we have both a level and a name!
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum()) + + # single groupers + expected = DataFrame({ 'Quantity' : [31], + 'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M')).sum() + assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq='1M')]).sum() + assert_frame_equal(result, expected) + + expected = DataFrame({ 'Quantity' : [31], + 'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum() + assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum() + assert_frame_equal(result, expected) + def test_cumcount(self): df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) g = df.groupby('A') diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 381d8937466b3..8ab7063eada17 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -70,14 +70,17 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.limit = limit self.base = base + # by definition we always sort + kwargs['sort'] = True + super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs) def resample(self, obj): - ax = obj._get_axis(self.axis) + self.set_grouper(obj) + ax = self.grouper - obj = self._ensure_sortedness(obj) if isinstance(ax, DatetimeIndex): - rs = self._resample_timestamps(obj) + rs = self._resample_timestamps() elif isinstance(ax, PeriodIndex): offset = to_offset(self.freq) if offset.n > 1: @@ -87,12 +90,13 @@ def resample(self, obj): self.kind = 'timestamp' if self.kind is None or self.kind == 'period': - rs = self._resample_periods(obj) + rs = self._resample_periods() else: - obj = obj.to_timestamp(how=self.convention) - rs = self._resample_timestamps(obj) + obj = self.obj.to_timestamp(how=self.convention) + self.set_grouper(obj) + rs = self._resample_timestamps() elif 
len(ax) == 0: - return obj + return self.obj else: # pragma: no cover raise TypeError('Only valid with DatetimeIndex or PeriodIndex') @@ -101,60 +105,42 @@ def resample(self, obj): return rs def get_grouper(self, obj): - # return a tuple of (binner, grouper, obj) - return self._get_time_grouper(obj) - - def _get_grouper_for_ax(self, ax): - # return an ordering of the transformed group labels, - # suitable for multi-grouping, e.g the labels for - # the resampled intervals + self.set_grouper(obj) + return self.get_binner_for_resample() - indexer = None - if not ax.is_monotonic: - indexer = ax.argsort(kind='quicksort') - ax = ax.take(indexer) + def get_binner_for_resample(self): + # create the BinGrouper + # assume that self.set_grouper(obj) has already been called + ax = self.ax if self.kind is None or self.kind == 'timestamp': - binner, bins, binlabels = self._get_time_bins(ax) + self.binner, bins, binlabels = self._get_time_bins(ax) else: - binner, bins, binlabels = self._get_time_period_bins(ax) + self.binner, bins, binlabels = self._get_time_period_bins(ax) - grp = BinGrouper(bins, binlabels) + self.grouper = BinGrouper(bins, binlabels) + return self.binner, self.grouper, self.obj + + def get_binner_for_grouping(self, obj): + # return an ordering of the transformed group labels, + # suitable for multi-grouping, e.g the labels for + # the resampled intervals + ax = self.set_grouper(obj) + self.get_binner_for_resample() # create the grouper + binner = self.binner l = [] - for key, group in grp.get_iterator(ax): + for key, group in self.grouper.get_iterator(ax): l.extend([key]*len(group)) grouper = binner.__class__(l,freq=binner.freq,name=binner.name) # since we may have had to sort # may need to reorder groups here - if indexer is not None: - grouper = grouper.take(indexer) + if self.indexer is not None: + grouper = grouper.take(self.indexer) return grouper - def _ensure_sortedness(self, obj): - # ensure that our object is sorted - ax = obj._get_axis(self.axis) - 
if not ax.is_monotonic: - try: - obj = obj.sort_index(axis=self.axis) - except: - obj = obj.sort_index() - return obj - - def _get_time_grouper(self, obj): - obj = self._ensure_sortedness(obj) - ax = obj._get_axis(self.axis) - - if self.kind is None or self.kind == 'timestamp': - binner, bins, binlabels = self._get_time_bins(ax) - else: - binner, bins, binlabels = self._get_time_period_bins(ax) - - grouper = BinGrouper(bins, binlabels) - return binner, grouper, obj - def _get_time_bins(self, ax): if not isinstance(ax, DatetimeIndex): raise TypeError('axis must be a DatetimeIndex, but got ' @@ -243,10 +229,14 @@ def _get_time_period_bins(self, ax): def _agg_method(self): return self.how if self.how else _DEFAULT_METHOD - def _resample_timestamps(self, obj): - axlabels = obj._get_axis(self.axis) + def _resample_timestamps(self): + # assumes set_grouper(obj) already called + axlabels = self.ax - binner, grouper, _ = self._get_time_grouper(obj) + self.get_binner_for_resample() + grouper = self.grouper + binner = self.binner + obj = self.obj # Determine if we're downsampling if axlabels.freq is not None or axlabels.inferred_freq is not None: @@ -286,8 +276,10 @@ def _resample_timestamps(self, obj): return result - def _resample_periods(self, obj): - axlabels = obj._get_axis(self.axis) + def _resample_periods(self): + # assumes set_grouper(obj) already called + axlabels = self.ax + obj = self.obj if len(axlabels) == 0: new_index = PeriodIndex(data=[], freq=self.freq)