Skip to content

Commit a316f2f

Browse files
committed
BUG/API: allow TimeGrouper with other columns in a groupby (GH3794)
1 parent e19b2eb commit a316f2f

File tree

5 files changed

+166
-6
lines changed

5 files changed

+166
-6
lines changed

pandas/core/groupby.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -1479,6 +1479,8 @@ class Grouping(object):
14791479
----------
14801480
index : Index
14811481
grouper :
1482+
obj :
1483+
axis :
14821484
name :
14831485
level :
14841486
@@ -1493,7 +1495,7 @@ class Grouping(object):
14931495
* groups : dict of {group -> label_list}
14941496
"""
14951497

1496-
def __init__(self, index, grouper=None, name=None, level=None,
1498+
def __init__(self, index, grouper=None, obj=None, axis=0, name=None, level=None,
14971499
sort=True):
14981500

14991501
self.name = name
@@ -1566,6 +1568,30 @@ def __init__(self, index, grouper=None, name=None, level=None,
15661568
if self.name is None:
15671569
self.name = factor.name
15681570

1571+
# a passed TimeGrouper-like object (any CustomGrouper)
1572+
elif isinstance(self.grouper, CustomGrouper):
1573+
1574+
# get the obj to work on
1575+
if self.grouper.name is not None:
1576+
name = self.grouper.name
1577+
if name not in obj._info_axis:
1578+
raise KeyError("The grouper name {0} is not found".format(name))
1579+
ax = Index(obj[name],name=name)
1580+
else:
1581+
ax = obj._get_axis(axis)
1582+
if self.grouper.level is not None:
1583+
level = self.grouper.level
1584+
if isinstance(ax, MultiIndex):
1585+
level = ax._get_level_name(level)
1586+
ax = Index(ax.get_level_values(level), name=level)
1587+
else:
1588+
if not (level == 0 or level == ax.name):
1589+
raise ValueError("The grouper level {0} is not valid".format(level))
1590+
1591+
self.grouper = self.grouper._get_grouper_for_ax(ax)
1592+
if self.name is None:
1593+
self.name = self.grouper.name
1594+
15691595
# no level passed
15701596
if not isinstance(self.grouper, (Series, np.ndarray)):
15711597
self.grouper = self.index.map(self.grouper)
@@ -1704,7 +1730,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
17041730
errmsg = "Categorical grouper must have len(grouper) == len(data)"
17051731
raise AssertionError(errmsg)
17061732

1707-
ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort)
1733+
ping = Grouping(group_axis, gpr, obj=obj, axis=axis, name=name, level=level, sort=sort)
17081734
groupings.append(ping)
17091735

17101736
if len(groupings) == 0:

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2031,7 +2031,7 @@ def reindex_axis(self, labels, axis=0, **kwargs):
20312031
raise ValueError("cannot reindex series on non-zero axis!")
20322032
return self.reindex(index=labels, **kwargs)
20332033

2034-
def take(self, indices, axis=0, convert=True):
2034+
def take(self, indices, axis=0, convert=True, is_copy=False):
20352035
"""
20362036
Analogous to ndarray.take, return Series corresponding to requested
20372037
indices

pandas/tests/test_groupby.py

+102
Original file line numberDiff line numberDiff line change
@@ -2903,6 +2903,108 @@ def test_groupby_with_timegrouper_methods(self):
29032903
self.assertTrue(isinstance(groups,dict))
29042904
self.assertTrue(len(groups) == 3)
29052905

2906+
def test_timegrouper_with_reg_groups(self):
    """GH 3794: a TimeGrouper can be combined with regular group keys.

    Groups a frame by a TimeGrouper together with the 'Buyer' column and
    compares the summed result with hand-built expectations, for several
    frequencies and for the ``name=`` / ``level=`` ways of pointing the
    TimeGrouper at its dates.
    """
    from datetime import datetime

    # columns shared by both input frames
    branches = 'A A A A A A A B'.split()
    buyers = 'Carl Mark Carl Carl Joe Joe Joe Carl'.split()
    quantities = [1, 3, 5, 1, 8, 1, 9, 3]

    def summed(frame, time_grouper):
        # group by the time grouper plus the regular 'Buyer' key
        return frame.groupby([time_grouper, 'Buyer']).sum()

    # --- dates spread over a whole year -------------------------------
    year_frame = DataFrame({
        'Branch': branches,
        'Buyer': buyers,
        'Quantity': quantities,
        'Date': [
            datetime(2013, 1, 1, 13, 0),
            datetime(2013, 1, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 12, 2, 12, 0),
            datetime(2013, 12, 2, 14, 0),
        ]}).set_index('Date')

    annual_expected = DataFrame({
        'Buyer': 'Carl Joe Mark'.split(),
        'Quantity': [10, 18, 3],
        'Date': [datetime(2013, 12, 31, 0, 0)] * 3,
    }).set_index(['Date', 'Buyer'])
    assert_frame_equal(summed(year_frame, pd.TimeGrouper('A')),
                       annual_expected)

    half_year_expected = DataFrame({
        'Buyer': 'Carl Mark Carl Joe'.split(),
        'Quantity': [1, 3, 9, 18],
        'Date': [
            datetime(2013, 1, 1, 0, 0),
            datetime(2013, 1, 1, 0, 0),
            datetime(2013, 7, 1, 0, 0),
            datetime(2013, 7, 1, 0, 0),
        ]}).set_index(['Date', 'Buyer'])
    assert_frame_equal(summed(year_frame, pd.TimeGrouper('6MS')),
                       half_year_expected)

    # --- dates within two days of a single month ----------------------
    october_frame = DataFrame({
        'Branch': branches,
        'Buyer': buyers,
        'Quantity': quantities,
        'Date': [
            datetime(2013, 10, 1, 13, 0),
            datetime(2013, 10, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 2, 12, 0),
            datetime(2013, 10, 2, 14, 0),
        ]}).set_index('Date')

    daily_expected = DataFrame({
        'Buyer': 'Carl Joe Mark Carl Joe'.split(),
        'Quantity': [6, 8, 3, 4, 10],
        'Date': ([datetime(2013, 10, 1, 0, 0)] * 3 +
                 [datetime(2013, 10, 2, 0, 0)] * 2),
    }).set_index(['Date', 'Buyer'])
    assert_frame_equal(summed(october_frame, pd.TimeGrouper('1D')),
                       daily_expected)

    monthly_expected = DataFrame({
        'Buyer': 'Carl Joe Mark'.split(),
        'Quantity': [10, 18, 3],
        'Date': [datetime(2013, 10, 31, 0, 0)] * 3,
    }).set_index(['Date', 'Buyer'])
    assert_frame_equal(summed(october_frame, pd.TimeGrouper('1M')),
                       monthly_expected)

    # --- TimeGrouper pointed at a column by name ----------------------
    flat_frame = october_frame.reset_index()
    assert_frame_equal(
        summed(flat_frame, pd.TimeGrouper('1M', name='Date')),
        monthly_expected)
    self.assertRaises(
        KeyError,
        lambda: summed(flat_frame, pd.TimeGrouper('1M', name='foo')))

    # --- TimeGrouper pointed at an index level ------------------------
    indexed_frame = flat_frame.set_index('Date')
    assert_frame_equal(
        summed(indexed_frame, pd.TimeGrouper('1M', level='Date')),
        monthly_expected)
    assert_frame_equal(
        summed(indexed_frame, pd.TimeGrouper('1M', level=0)),
        monthly_expected)
    self.assertRaises(
        ValueError,
        lambda: summed(indexed_frame, pd.TimeGrouper('1M', level='foo')))
3007+
29063008
def test_cumcount(self):
29073009
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
29083010
g = df.groupby('A')

pandas/tseries/resample.py

+35-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class TimeGrouper(CustomGrouper):
3030
nperiods : optional, integer
3131
convention : {'start', 'end', 'e', 's'}
3232
If axis is PeriodIndex
33+
name : referring name, default None
34+
level : referring level, default None
3335
3436
Notes
3537
-----
@@ -39,7 +41,7 @@ class TimeGrouper(CustomGrouper):
3941
def __init__(self, freq='Min', closed=None, label=None, how='mean',
4042
nperiods=None, axis=0,
4143
fill_method=None, limit=None, loffset=None, kind=None,
42-
convention=None, base=0):
44+
convention=None, base=0, name=None, level=None):
4345
self.freq = to_offset(freq)
4446

4547
end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'])
@@ -70,6 +72,8 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
7072
self.fill_method = fill_method
7173
self.limit = limit
7274
self.base = base
75+
self.name = name
76+
self.level = level
7377

7478
def resample(self, obj):
7579
ax = obj._get_axis(self.axis)
@@ -103,13 +107,42 @@ def get_grouper(self, obj):
103107
# return a tuple of (binner, grouper, obj)
104108
return self._get_time_grouper(obj)
105109

110+
def _get_grouper_for_ax(self, ax):
    """Return the resampled-interval label for each element of ``ax``.

    The result is intended to be used as one key among several in a
    multi-key groupby: the labels are returned in the same order as the
    elements of the ``ax`` that was passed in, even when ``ax`` is not
    sorted.

    Parameters
    ----------
    ax : Index
        The axis values to bin. They may be unsorted.

    Returns
    -------
    An index of the same class as the binner, built from the bin labels
    repeated once per element of each bin (presumably one label per
    element of ``ax`` -- this relies on ``BinGrouper.get_iterator``
    covering every element).
    """
    # The bins are computed on a sorted axis; remember the sorting
    # permutation so the labels can be put back in the caller's order.
    indexer = None
    if not ax.is_monotonic:
        indexer = ax.argsort(kind='quicksort')
        ax = ax.take(indexer)

    if self.kind is None or self.kind == 'timestamp':
        binner, bins, binlabels = self._get_time_bins(ax)
    else:
        binner, bins, binlabels = self._get_time_period_bins(ax)

    grp = BinGrouper(bins, binlabels)

    # repeat every bin label once per element that falls in that bin
    labels = []
    for key, group in grp.get_iterator(ax):
        labels.extend([key] * len(group))
    grouper = binner.__class__(labels, freq=binner.freq, name=binner.name)

    if indexer is not None:
        # ``labels`` follows the sorted axis: sorted position i holds the
        # element that was originally at position indexer[i].  Undoing
        # the sort therefore needs the inverse permutation; taking with
        # ``indexer`` itself would apply the sort a second time and
        # attach labels to the wrong rows.
        grouper = grouper.take(indexer.argsort())
    return grouper
138+
106139
def _ensure_sortedness(self, obj):
    """Return ``obj`` sorted along ``self.axis``.

    If the axis of ``obj`` is already monotonic, ``obj`` is returned
    unchanged. Otherwise it is sorted with ``sort_index(axis=self.axis)``;
    if that call fails, the sort is retried as a plain ``sort_index()``.

    Parameters
    ----------
    obj : object exposing ``_get_axis`` and ``sort_index``

    Returns
    -------
    ``obj`` itself, or the result of ``obj.sort_index``.
    """
    ax = obj._get_axis(self.axis)
    if ax.is_monotonic:
        return obj

    try:
        return obj.sort_index(axis=self.axis)
    except Exception:
        # Best-effort fallback for objects that cannot be sorted with an
        # explicit axis.  ``Exception`` (rather than a bare ``except``)
        # keeps the fallback broad while still letting KeyboardInterrupt
        # and SystemExit propagate.
        return obj.sort_index()
115148

pandas/tseries/tests/test_resample.py

-1
Original file line numberDiff line numberDiff line change
@@ -1137,7 +1137,6 @@ def test_apply_iteration(self):
11371137
_, grouper, _ = tg.get_grouper(df)
11381138

11391139
# Errors
1140-
11411140
grouped = df.groupby(grouper, group_keys=False)
11421141
f = lambda df: df['close'] / df['open']
11431142

0 commit comments

Comments
 (0)