Skip to content

Commit a7b19f9

Browse files
committed
CLN/API: replace groupby.CustomGrouper with Grouper
Rename the internal Grouper class to BaseGrouper to avoid a naming conflict; TimeGrouper now inherits from the new public Grouper.
1 parent a316f2f commit a7b19f9

File tree

4 files changed

+154
-56
lines changed

4 files changed

+154
-56
lines changed

pandas/core/api.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pandas.core.algorithms import factorize, match, unique, value_counts
77
from pandas.core.common import isnull, notnull
88
from pandas.core.categorical import Categorical
9+
from pandas.core.groupby import Grouper
910
from pandas.core.format import set_eng_float_format
1011
from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex
1112

pandas/core/groupby.py

+124-41
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,111 @@ def _last(x):
139139
else:
140140
return _last(x)
141141

142+
class Grouper(object):
    """
    A Grouper allows the user to specify a groupby instruction

    Parameters
    ----------
    key : groupby key, default None
        an item of the target object's info axis (looked up via ``obj[key]``)
    level : name, int level number, default None
        a level of the target axis; mutually exclusive with ``key``
    freq : string / frequency object, default None
        when passed as a keyword, a TimeGrouper is constructed instead
    axis : number/name of the axis, default None
        None is treated as axis 0 when the axis has to be looked up
    sort : boolean, whether to sort the resulting labels, default True

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A')
    df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date'
    df.groupby(Grouper(level='date',freq='60s',axis=1)) :
        specify a resample on the level 'date' on the columns axis with a frequency of 60s

    """

    def __new__(cls, *args, **kwargs):
        # a frequency keyword turns the specification into a time-based one;
        # the import is local because pandas.tseries.resample itself imports
        # Grouper from this module
        if kwargs.get('freq') is not None:
            from pandas.tseries.resample import TimeGrouper
            cls = TimeGrouper
        return super(Grouper, cls).__new__(cls)

    def __init__(self, key=None, level=None, freq=None, axis=None, sort=True):
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        # read by the ``groups`` property; this base class never assigns it
        self.grouper = None

    def get_grouper(self, obj):
        """
        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        a tuple of binner, grouper, obj (possibly sorted)
        """

        # default is to not use a binner
        return None, self.get_grouper_for_ax(obj), obj

    def get_grouper_for_ax(self, obj):
        """
        given an object and the specifications, return a grouper for this
        particular specification

        Parameters
        ----------
        obj : the subject object

        Returns
        -------
        grouper : an index mapping, or a BinGrouper like object

        Raises
        ------
        ValueError
            if both a key and a level are specified, or the level is not
            valid for the selected axis
        KeyError
            if the key is not found in the info axis of obj
        """

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)

        else:
            # the axis defaults to None in __init__; fall back to the first axis
            axis = 0 if self.axis is None else self.axis
            ax = obj._get_axis(axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):

                    # let the MultiIndex resolve the level (by name or by
                    # number); report failures with the same exception type
                    # as the single-level branch below
                    try:
                        values = ax.get_level_values(level)
                    except (KeyError, IndexError):
                        raise ValueError("The grouper level {0} is not valid".format(level))

                    # name the result after the level, even when the level
                    # was given as a number
                    ax = Index(values, name=values.name)

                elif not (level == 0 or level == ax.name):
                    raise ValueError("The grouper level {0} is not valid".format(level))

        return self._get_grouper_for_ax(ax)

    def _get_grouper_for_ax(self, ax):
        # hook for subclasses; the plain Grouper groups on the axis values as-is
        return ax

    @property
    def groups(self):
        # delegates to the generated grouper
        return self.grouper.groups
142247

143248
class GroupBy(PandasObject):
144249

@@ -882,10 +987,9 @@ def _is_indexed_like(obj, axes):
882987
return False
883988

884989

885-
class Grouper(object):
886-
990+
class BaseGrouper(object):
887991
"""
888-
992+
This is an internal Grouper class, which actually holds the generated groups
889993
"""
890994

891995
def __init__(self, axis, groupings, sort=True, group_keys=True):
@@ -1328,19 +1432,7 @@ def generate_bins_generic(values, binner, closed):
13281432

13291433
return bins
13301434

1331-
1332-
class CustomGrouper(object):
1333-
1334-
def get_grouper(self, obj):
1335-
raise NotImplementedError
1336-
1337-
# delegates
1338-
@property
1339-
def groups(self):
1340-
return self.grouper.groups
1341-
1342-
1343-
class BinGrouper(Grouper):
1435+
class BinGrouper(BaseGrouper):
13441436

13451437
def __init__(self, bins, binlabels, filter_empty=False):
13461438
self.bins = com._ensure_int64(bins)
@@ -1495,7 +1587,7 @@ class Grouping(object):
14951587
* groups : dict of {group -> label_list}
14961588
"""
14971589

1498-
def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
1590+
def __init__(self, index, grouper=None, obj=None, name=None, level=None,
14991591
sort=True):
15001592

15011593
self.name = name
@@ -1515,6 +1607,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
15151607
self._was_factor = False
15161608
self._should_compress = True
15171609

1610+
# we have a single grouper which may be a myriad of things, some of which are
1611+
# dependent on the passed-in level
1612+
#
1613+
15181614
if level is not None:
15191615
if not isinstance(level, int):
15201616
if level not in index.names:
@@ -1556,7 +1652,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
15561652
else:
15571653
if isinstance(self.grouper, (list, tuple)):
15581654
self.grouper = com._asarray_tuplesafe(self.grouper)
1655+
1656+
# a passed Categorical
15591657
elif isinstance(self.grouper, Categorical):
1658+
15601659
factor = self.grouper
15611660
self._was_factor = True
15621661

@@ -1568,27 +1667,10 @@ def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
15681667
if self.name is None:
15691668
self.name = factor.name
15701669

1571-
# a passed TimeGrouper like
1572-
elif isinstance(self.grouper, CustomGrouper):
1573-
1574-
# get the obj to work on
1575-
if self.grouper.name is not None:
1576-
name = self.grouper.name
1577-
if name not in obj._info_axis:
1578-
raise KeyError("The grouper name {0} is not found".format(name))
1579-
ax = Index(obj[name],name=name)
1580-
else:
1581-
ax = obj._get_axis(axis)
1582-
if self.grouper.level is not None:
1583-
level = self.grouper.level
1584-
if isinstance(ax, MultiIndex):
1585-
level = ax._get_level_name(level)
1586-
ax = Index(ax.get_level_values(level), name=level)
1587-
else:
1588-
if not (level == 0 or level == ax.name):
1589-
raise ValueError("The grouper level {0} is not valid".format(level))
1670+
# a passed Grouper like
1671+
elif isinstance(self.grouper, Grouper):
15901672

1591-
self.grouper = self.grouper._get_grouper_for_ax(ax)
1673+
self.grouper = self.grouper.get_grouper_for_ax(obj)
15921674
if self.name is None:
15931675
self.name = self.grouper.name
15941676

@@ -1674,10 +1756,10 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
16741756
level = None
16751757
key = group_axis
16761758

1677-
if isinstance(key, CustomGrouper):
1759+
if isinstance(key, Grouper):
16781760
binner, gpr, obj = key.get_grouper(obj)
16791761
return gpr, [], obj
1680-
elif isinstance(key, Grouper):
1762+
elif isinstance(key, BaseGrouper):
16811763
return key, [], obj
16821764

16831765
if not isinstance(key, (tuple, list)):
@@ -1730,13 +1812,14 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
17301812
errmsg = "Categorical grouper must have len(grouper) == len(data)"
17311813
raise AssertionError(errmsg)
17321814

1733-
ping = Grouping(group_axis, gpr, obj=obj, axis=axis, name=name, level=level, sort=sort)
1815+
ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
17341816
groupings.append(ping)
17351817

17361818
if len(groupings) == 0:
17371819
raise ValueError('No group keys passed!')
17381820

1739-
grouper = Grouper(group_axis, groupings, sort=sort)
1821+
# create the internal grouper
1822+
grouper = BaseGrouper(group_axis, groupings, sort=sort)
17401823

17411824
return grouper, exclusions, obj
17421825

pandas/tests/test_groupby.py

+22-5
Original file line numberDiff line numberDiff line change
@@ -2991,19 +2991,36 @@ def test_timegrouper_with_reg_groups(self):
29912991

29922992
# passing the name
29932993
df = df.reset_index()
2994-
result = df.groupby([pd.TimeGrouper('1M',name='Date'),'Buyer']).sum()
2994+
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
29952995
assert_frame_equal(result,expected)
29962996

2997-
self.assertRaises(KeyError, lambda : df.groupby([pd.TimeGrouper('1M',name='foo'),'Buyer']).sum())
2997+
self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum())
29982998

29992999
# passing the level
30003000
df = df.set_index('Date')
3001-
result = df.groupby([pd.TimeGrouper('1M',level='Date'),'Buyer']).sum()
3001+
result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum()
30023002
assert_frame_equal(result,expected)
3003-
result = df.groupby([pd.TimeGrouper('1M',level=0),'Buyer']).sum()
3003+
result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum()
30043004
assert_frame_equal(result,expected)
30053005

3006-
self.assertRaises(ValueError, lambda : df.groupby([pd.TimeGrouper('1M',level='foo'),'Buyer']).sum())
3006+
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum())
3007+
3008+
# multi names
3009+
df = df.copy()
3010+
df['Date'] = df.index + pd.offsets.MonthEnd(2)
3011+
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
3012+
expected = DataFrame({
3013+
'Buyer': 'Carl Joe Mark'.split(),
3014+
'Quantity': [10,18,3],
3015+
'Date' : [
3016+
DT.datetime(2013,11,30,0,0),
3017+
DT.datetime(2013,11,30,0,0),
3018+
DT.datetime(2013,11,30,0,0),
3019+
]}).set_index(['Date','Buyer'])
3020+
assert_frame_equal(result,expected)
3021+
3022+
# error as we have both a key and a level!
3023+
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())
30073024

30083025
def test_cumcount(self):
30093026
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])

pandas/tseries/resample.py

+7-10
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import numpy as np
44

5-
from pandas.core.groupby import BinGrouper, CustomGrouper
5+
from pandas.core.groupby import BinGrouper, Grouper
66
from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod
77
from pandas.tseries.index import DatetimeIndex, date_range
88
from pandas.tseries.offsets import DateOffset, Tick, _delta_to_nanoseconds
@@ -18,7 +18,7 @@
1818
_DEFAULT_METHOD = 'mean'
1919

2020

21-
class TimeGrouper(CustomGrouper):
21+
class TimeGrouper(Grouper):
2222
"""
2323
Custom groupby class for time-interval grouping
2424
@@ -30,8 +30,6 @@ class TimeGrouper(CustomGrouper):
3030
nperiods : optional, integer
3131
convention : {'start', 'end', 'e', 's'}
3232
If axis is PeriodIndex
33-
name : referring name, default None
34-
level : referering level, default None
3533
3634
Notes
3735
-----
@@ -41,11 +39,11 @@ class TimeGrouper(CustomGrouper):
4139
def __init__(self, freq='Min', closed=None, label=None, how='mean',
4240
nperiods=None, axis=0,
4341
fill_method=None, limit=None, loffset=None, kind=None,
44-
convention=None, base=0, name=None, level=None):
45-
self.freq = to_offset(freq)
42+
convention=None, base=0, **kwargs):
43+
freq = to_offset(freq)
4644

4745
end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'])
48-
rule = self.freq.rule_code
46+
rule = freq.rule_code
4947
if (rule in end_types or
5048
('-' in rule and rule[:rule.find('-')] in end_types)):
5149
if closed is None:
@@ -66,14 +64,13 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
6664
self.convention = convention or 'E'
6765
self.convention = self.convention.lower()
6866

69-
self.axis = axis
7067
self.loffset = loffset
7168
self.how = how
7269
self.fill_method = fill_method
7370
self.limit = limit
7471
self.base = base
75-
self.name = name
76-
self.level = level
72+
73+
super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs)
7774

7875
def resample(self, obj):
7976
ax = obj._get_axis(self.axis)

0 commit comments

Comments
 (0)