Skip to content

Commit 26312e1

Browse files
committed
Merge pull request #6350 from jreback/groupby_sorted
BUG: TimeGrouper sortedness / API fix (GH4161,GH3881)
2 parents d362df0 + cb8ae6e commit 26312e1

File tree

5 files changed

+133
-48
lines changed

5 files changed

+133
-48
lines changed

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ Bug Fixes
9797
``to_replace`` argument (:issue:`6332`)
9898
- Raise when trying to align on different levels of a multi-index assignment (:issue:`3738`)
9999
- Bug in setting complex dtypes via boolean indexing (:issue:`6345`)
100+
- Bug in TimeGrouper/resample where a non-monotonic DatetimeIndex would return invalid results (:issue:`4161`)
101+
- Bug in index name propagation in TimeGrouper/resample (:issue:`4161`)
102+
- TimeGrouper now has an API more compatible with the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`)
100103

101104
pandas 0.13.1
102105
-------------

pandas/core/groupby.py

+22-8
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,6 @@ def __init__(self, obj, keys=None, axis=0, level=None,
217217
if isinstance(obj, NDFrame):
218218
obj._consolidate_inplace()
219219

220-
self.obj = obj
221-
self.axis = obj._get_axis_number(axis)
222220
self.level = level
223221

224222
if not as_index:
@@ -234,9 +232,11 @@ def __init__(self, obj, keys=None, axis=0, level=None,
234232
self.squeeze = squeeze
235233

236234
if grouper is None:
237-
grouper, exclusions = _get_grouper(obj, keys, axis=axis,
238-
level=level, sort=sort)
235+
grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
236+
level=level, sort=sort)
239237

238+
self.obj = obj
239+
self.axis = obj._get_axis_number(axis)
240240
self.grouper = grouper
241241
self.exclusions = set(exclusions) if exclusions else set()
242242

@@ -1106,6 +1106,7 @@ def aggregate(self, values, how, axis=0):
11061106

11071107
# will be filled in Cython function
11081108
result = np.empty(out_shape, dtype=values.dtype)
1109+
result.fill(np.nan)
11091110
counts = np.zeros(self.ngroups, dtype=np.int64)
11101111

11111112
result = self._aggregate(result, counts, values, how, is_numeric)
@@ -1258,6 +1259,11 @@ class CustomGrouper(object):
12581259
def get_grouper(self, obj):
12591260
raise NotImplementedError
12601261

1262+
# delegates
1263+
@property
1264+
def groups(self):
1265+
return self.grouper.groups
1266+
12611267

12621268
class BinGrouper(Grouper):
12631269

@@ -1266,6 +1272,14 @@ def __init__(self, bins, binlabels, filter_empty=False):
12661272
self.binlabels = _ensure_index(binlabels)
12671273
self._filter_empty_groups = filter_empty
12681274

1275+
@cache_readonly
1276+
def groups(self):
1277+
""" dict {group name -> group labels} """
1278+
1279+
# this is mainly for compat
1280+
# GH 3881
1281+
return dict(zip(self.binlabels,self.bins))
1282+
12691283
@property
12701284
def nkeys(self):
12711285
return 1
@@ -1560,10 +1574,10 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
15601574
key = group_axis
15611575

15621576
if isinstance(key, CustomGrouper):
1563-
gpr = key.get_grouper(obj)
1564-
return gpr, []
1577+
binner, gpr, obj = key.get_grouper(obj)
1578+
return gpr, [], obj
15651579
elif isinstance(key, Grouper):
1566-
return key, []
1580+
return key, [], obj
15671581

15681582
if not isinstance(key, (tuple, list)):
15691583
keys = [key]
@@ -1623,7 +1637,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
16231637

16241638
grouper = Grouper(group_axis, groupings, sort=sort)
16251639

1626-
return grouper, exclusions
1640+
return grouper, exclusions, obj
16271641

16281642

16291643
def _is_label_like(val):

pandas/tests/test_groupby.py

+58
Original file line numberDiff line numberDiff line change
@@ -2764,6 +2764,64 @@ def test_groupby_with_empty(self):
27642764
grouped = series.groupby(grouper)
27652765
assert next(iter(grouped), None) is None
27662766

2767+
def test_groupby_with_timegrouper(self):
2768+
# GH 4161
2769+
# TimeGrouper requires a sorted index
2770+
# also verifies that the resultant index has the correct name
2771+
import datetime as DT
2772+
df = DataFrame({
2773+
'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
2774+
'Quantity': [18,3,5,1,9,3],
2775+
'Date' : [
2776+
DT.datetime(2013,9,1,13,0),
2777+
DT.datetime(2013,9,1,13,5),
2778+
DT.datetime(2013,10,1,20,0),
2779+
DT.datetime(2013,10,3,10,0),
2780+
DT.datetime(2013,12,2,12,0),
2781+
DT.datetime(2013,9,2,14,0),
2782+
]})
2783+
df = df.set_index(['Date'])
2784+
2785+
expected = DataFrame({ 'Quantity' : np.nan },
2786+
index=date_range('20130901 13:00:00','20131205 13:00:00',freq='5D',name='Date'))
2787+
expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')
2788+
2789+
result1 = df.resample('5D',how=sum)
2790+
assert_frame_equal(result1, expected)
2791+
2792+
df_sorted = df.sort_index()
2793+
result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
2794+
assert_frame_equal(result2, expected)
2795+
2796+
result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
2797+
assert_frame_equal(result3, expected)
2798+
2799+
def test_groupby_with_timegrouper_methods(self):
2800+
# GH 3881
2801+
# make sure API of timegrouper conforms
2802+
2803+
import datetime as DT
2804+
df = pd.DataFrame({
2805+
'Branch' : 'A A A A A B'.split(),
2806+
'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
2807+
'Quantity': [1,3,5,8,9,3],
2808+
'Date' : [
2809+
DT.datetime(2013,1,1,13,0),
2810+
DT.datetime(2013,1,1,13,5),
2811+
DT.datetime(2013,10,1,20,0),
2812+
DT.datetime(2013,10,2,10,0),
2813+
DT.datetime(2013,12,2,12,0),
2814+
DT.datetime(2013,12,2,14,0),
2815+
]})
2816+
2817+
df = df.set_index('Date', drop=False)
2818+
g = df.groupby(pd.TimeGrouper('6M'))
2819+
self.assertTrue(g.group_keys)
2820+
self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper))
2821+
groups = g.groups
2822+
self.assertTrue(isinstance(groups,dict))
2823+
self.assertTrue(len(groups) == 3)
2824+
27672825
def test_cumcount(self):
27682826
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
27692827
g = df.groupby('A')

pandas/tseries/resample.py

+48-38
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,12 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
7272
self.base = base
7373

7474
def resample(self, obj):
75-
axis = obj._get_axis(self.axis)
75+
ax = obj._get_axis(self.axis)
7676

77-
if not axis.is_monotonic:
78-
try:
79-
obj = obj.sort_index(axis=self.axis)
80-
except TypeError:
81-
obj = obj.sort_index()
82-
83-
if isinstance(axis, DatetimeIndex):
77+
obj = self._ensure_sortedness(obj)
78+
if isinstance(ax, DatetimeIndex):
8479
rs = self._resample_timestamps(obj)
85-
elif isinstance(axis, PeriodIndex):
80+
elif isinstance(ax, PeriodIndex):
8681
offset = to_offset(self.freq)
8782
if offset.n > 1:
8883
if self.kind == 'period': # pragma: no cover
@@ -95,55 +90,68 @@ def resample(self, obj):
9590
else:
9691
obj = obj.to_timestamp(how=self.convention)
9792
rs = self._resample_timestamps(obj)
98-
elif len(axis) == 0:
93+
elif len(ax) == 0:
9994
return obj
10095
else: # pragma: no cover
10196
raise TypeError('Only valid with DatetimeIndex or PeriodIndex')
10297

10398
rs_axis = rs._get_axis(self.axis)
104-
rs_axis.name = axis.name
99+
rs_axis.name = ax.name
105100
return rs
106101

107102
def get_grouper(self, obj):
108-
# Only return grouper
109-
return self._get_time_grouper(obj)[1]
103+
# return a tuple of (binner, grouper, obj)
104+
return self._get_time_grouper(obj)
105+
106+
def _ensure_sortedness(self, obj):
107+
# ensure that our object is sorted
108+
ax = obj._get_axis(self.axis)
109+
if not ax.is_monotonic:
110+
try:
111+
obj = obj.sort_index(axis=self.axis)
112+
except TypeError:
113+
obj = obj.sort_index()
114+
return obj
110115

111116
def _get_time_grouper(self, obj):
112-
axis = obj._get_axis(self.axis)
117+
obj = self._ensure_sortedness(obj)
118+
ax = obj._get_axis(self.axis)
113119

114120
if self.kind is None or self.kind == 'timestamp':
115-
binner, bins, binlabels = self._get_time_bins(axis)
121+
binner, bins, binlabels = self._get_time_bins(ax)
116122
else:
117-
binner, bins, binlabels = self._get_time_period_bins(axis)
123+
binner, bins, binlabels = self._get_time_period_bins(ax)
118124

119125
grouper = BinGrouper(bins, binlabels)
120-
return binner, grouper
126+
return binner, grouper, obj
121127

122-
def _get_time_bins(self, axis):
123-
if not isinstance(axis, DatetimeIndex):
128+
def _get_time_bins(self, ax):
129+
if not isinstance(ax, DatetimeIndex):
124130
raise TypeError('axis must be a DatetimeIndex, but got '
125-
'an instance of %r' % type(axis).__name__)
131+
'an instance of %r' % type(ax).__name__)
126132

127-
if len(axis) == 0:
128-
binner = labels = DatetimeIndex(data=[], freq=self.freq)
133+
if len(ax) == 0:
134+
binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
129135
return binner, [], labels
130136

131-
first, last = _get_range_edges(axis, self.freq, closed=self.closed,
137+
first, last = _get_range_edges(ax, self.freq, closed=self.closed,
132138
base=self.base)
133-
tz = axis.tz
139+
tz = ax.tz
134140
binner = labels = DatetimeIndex(freq=self.freq,
135141
start=first.replace(tzinfo=None),
136-
end=last.replace(tzinfo=None), tz=tz)
142+
end=last.replace(tzinfo=None),
143+
tz=tz,
144+
name=ax.name)
137145

138146
# a little hack
139147
trimmed = False
140-
if (len(binner) > 2 and binner[-2] == axis[-1] and
148+
if (len(binner) > 2 and binner[-2] == ax[-1] and
141149
self.closed == 'right'):
142150

143151
binner = binner[:-1]
144152
trimmed = True
145153

146-
ax_values = axis.asi8
154+
ax_values = ax.asi8
147155
binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
148156

149157
# general version, knowing nothing about relative frequencies
@@ -180,22 +188,24 @@ def _adjust_bin_edges(self, binner, ax_values):
180188

181189
return binner, bin_edges
182190

183-
def _get_time_period_bins(self, axis):
184-
if not isinstance(axis, DatetimeIndex):
191+
def _get_time_period_bins(self, ax):
192+
if not isinstance(ax, DatetimeIndex):
185193
raise TypeError('axis must be a DatetimeIndex, but got '
186-
'an instance of %r' % type(axis).__name__)
194+
'an instance of %r' % type(ax).__name__)
187195

188-
if not len(axis):
189-
binner = labels = PeriodIndex(data=[], freq=self.freq)
196+
if not len(ax):
197+
binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
190198
return binner, [], labels
191199

192-
labels = binner = PeriodIndex(start=axis[0], end=axis[-1],
193-
freq=self.freq)
200+
labels = binner = PeriodIndex(start=ax[0],
201+
end=ax[-1],
202+
freq=self.freq,
203+
name=ax.name)
194204

195205
end_stamps = (labels + 1).asfreq(self.freq, 's').to_timestamp()
196-
if axis.tzinfo:
197-
end_stamps = end_stamps.tz_localize(axis.tzinfo)
198-
bins = axis.searchsorted(end_stamps, side='left')
206+
if ax.tzinfo:
207+
end_stamps = end_stamps.tz_localize(ax.tzinfo)
208+
bins = ax.searchsorted(end_stamps, side='left')
199209

200210
return binner, bins, labels
201211

@@ -206,7 +216,7 @@ def _agg_method(self):
206216
def _resample_timestamps(self, obj):
207217
axlabels = obj._get_axis(self.axis)
208218

209-
binner, grouper = self._get_time_grouper(obj)
219+
binner, grouper, _ = self._get_time_grouper(obj)
210220

211221
# Determine if we're downsampling
212222
if axlabels.freq is not None or axlabels.inferred_freq is not None:

pandas/tseries/tests/test_resample.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,7 @@ def test_apply_iteration(self):
11001100
df = DataFrame({'open': 1, 'close': 2}, index=ind)
11011101
tg = TimeGrouper('M')
11021102

1103-
grouper = tg.get_grouper(df)
1103+
_, grouper, _ = tg.get_grouper(df)
11041104

11051105
# Errors
11061106

@@ -1118,7 +1118,7 @@ def test_panel_aggregation(self):
11181118
minor_axis=['A', 'B', 'C', 'D'])
11191119

11201120
tg = TimeGrouper('M', axis=1)
1121-
grouper = tg.get_grouper(wp)
1121+
_, grouper, _ = tg.get_grouper(wp)
11221122
bingrouped = wp.groupby(grouper)
11231123
binagg = bingrouped.mean()
11241124

0 commit comments

Comments
 (0)