Skip to content

Commit 7ffa655

Browse files
committed
Merge pull request pandas-dev#6690 from jreback/resample_bin
BUG: Bug in resample with extra bins when using an evenly divisible freq (GH4076)
2 parents 83b1ce4 + 4214a17 commit 7ffa655

File tree

5 files changed

+55
-20
lines changed

5 files changed

+55
-20
lines changed

doc/source/release.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ Improvements to existing features
185185
- Performance improvement when converting ``DatetimeIndex`` to floating ordinals
186186
using ``DatetimeConverter`` (:issue:`6636`)
187187
- Performance improvement for ``DataFrame.shift`` (:issue:`5609`)
188-
188+
189189
.. _release.bug_fixes-0.14.0:
190190

191191
Bug Fixes
@@ -270,6 +270,7 @@ Bug Fixes
270270
- Bug in compat with ``np.compress``, surfaced in (:issue:`6658`)
271271
- Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`)
272272
- Bug in ``DataFrame.to_stata`` which incorrectly handles nan values and ignores 'with_index' keyword argument (:issue:`6685`)
273+
- Bug in ``resample`` producing extra bins when using an evenly divisible frequency (:issue:`4076`)
273274

274275
pandas 0.13.1
275276
-------------

pandas/core/groupby.py

+13-18
Original file line numberDiff line numberDiff line change
@@ -1486,25 +1486,20 @@ def get_iterator(self, data, axis=0):
14861486
Generator yielding sequence of (name, subsetted object)
14871487
for each group
14881488
"""
1489-
if axis == 0:
1490-
start = 0
1491-
for edge, label in zip(self.bins, self.binlabels):
1492-
yield label, data[start:edge]
1493-
start = edge
1494-
1495-
if start < len(data):
1496-
yield self.binlabels[-1], data[start:]
1489+
if isinstance(data, NDFrame):
1490+
slicer = lambda start,edge: data._slice(slice(start,edge),axis=axis)
1491+
length = len(data.axes[axis])
14971492
else:
1498-
start = 0
1499-
for edge, label in zip(self.bins, self.binlabels):
1500-
inds = lrange(start, edge)
1501-
yield label, data.take(inds, axis=axis)
1502-
start = edge
1503-
1504-
n = len(data.axes[axis])
1505-
if start < n:
1506-
inds = lrange(start, n)
1507-
yield self.binlabels[-1], data.take(inds, axis=axis)
1493+
slicer = lambda start,edge: data[slice(start,edge)]
1494+
length = len(data)
1495+
1496+
start = 0
1497+
for edge, label in zip(self.bins, self.binlabels):
1498+
yield label, slicer(start,edge)
1499+
start = edge
1500+
1501+
if start < length:
1502+
yield self.binlabels[-1], slicer(start,None)
15081503

15091504
def apply(self, f, data, axis=0):
15101505
result_keys = []

pandas/tests/test_groupby.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2864,7 +2864,8 @@ def test_groupby_with_timegrouper(self):
28642864
df = df.set_index(['Date'])
28652865

28662866
expected = DataFrame({ 'Quantity' : np.nan },
2867-
index=date_range('20130901 13:00:00','20131205 13:00:00',freq='5D',name='Date'))
2867+
index=date_range('20130901 13:00:00','20131205 13:00:00',
2868+
freq='5D',name='Date',closed='left'))
28682869
expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')
28692870

28702871
result1 = df.resample('5D',how=sum)

pandas/tseries/resample.py

+6
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,12 @@ def _get_time_bins(self, ax):
185185
elif not trimmed:
186186
labels = labels[:-1]
187187

188+
# if we end up with more labels than bins
189+
# adjust the labels
190+
# GH4076
191+
if len(bins) < len(labels):
192+
labels = labels[:len(bins)]
193+
188194
return binner, bins, labels
189195

190196
def _adjust_bin_edges(self, binner, ax_values):

pandas/tseries/tests/test_resample.py

+32
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,38 @@ def test_resample_doesnt_truncate(self):
10871087
result = series.resample('D')
10881088
self.assertEquals(result.index[0], dates[0])
10891089

1090+
def test_evenly_divisible_with_no_extra_bins(self):
1091+
# 4076
1092+
# when the frequency is evenly divisible, sometimes extra bins were created
1093+
1094+
df = DataFrame(np.random.randn(9, 3), index=date_range('2000-1-1', periods=9))
1095+
result = df.resample('5D')
1096+
expected = pd.concat([df.iloc[0:5].mean(),df.iloc[5:].mean()],axis=1).T
1097+
expected.index = [Timestamp('2000-1-1'),Timestamp('2000-1-6')]
1098+
assert_frame_equal(result,expected)
1099+
1100+
index = date_range(start='2001-5-4', periods=28)
1101+
df = DataFrame(
1102+
[{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90,
1103+
'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 +
1104+
[{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10,
1105+
'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28,
1106+
index=index.append(index)).sort()
1107+
1108+
index = date_range('2001-5-4',periods=4,freq='7D')
1109+
expected = DataFrame(
1110+
[{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
1111+
'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
1112+
index=index).unstack().swaplevel(1,0).sortlevel()
1113+
result = df.resample('7D', how='count')
1114+
assert_series_equal(result,expected)
1115+
1116+
expected = DataFrame(
1117+
[{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,
1118+
'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4,
1119+
index=index)
1120+
result = df.resample('7D', how='sum')
1121+
assert_frame_equal(result,expected)
10901122

10911123
class TestTimeGrouper(tm.TestCase):
10921124

0 commit comments

Comments
 (0)