From 4214a17a5dfbf4e38afaef784358eefcd1e6be59 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 22 Mar 2014 18:11:14 -0400 Subject: [PATCH] BUG: Bug in resample with extra bins when using an evenly divisible frequency (GH4076) --- doc/source/release.rst | 3 ++- pandas/core/groupby.py | 31 +++++++++++--------------- pandas/tests/test_groupby.py | 3 ++- pandas/tseries/resample.py | 6 +++++ pandas/tseries/tests/test_resample.py | 32 +++++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 20 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 55861a0f1b0f0..df0f472c390c7 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -185,7 +185,7 @@ Improvements to existing features - Performance improvement when converting ``DatetimeIndex`` to floating ordinals using ``DatetimeConverter`` (:issue:`6636`) - Performance improvement for ``DataFrame.shift`` (:issue: `5609`) - + .. _release.bug_fixes-0.14.0: Bug Fixes @@ -270,6 +270,7 @@ Bug Fixes - Bug in compat with ``np.compress``, surfaced in (:issue:`6658`) - Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`) - Bug in ``DataFrame.to_stata`` which incorrectly handles nan values and ignores 'with_index' keyword argument (:issue:`6685`) +- Bug in resample with extra bins when using an evenly divisible frequency (:issue:`4076`) pandas 0.13.1 ------------- diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 15e6381cbe2fa..83964571fca8f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1486,25 +1486,20 @@ def get_iterator(self, data, axis=0): Generator yielding sequence of (name, subsetted object) for each group """ - if axis == 0: - start = 0 - for edge, label in zip(self.bins, self.binlabels): - yield label, data[start:edge] - start = edge - - if start < len(data): - yield self.binlabels[-1], data[start:] + if isinstance(data, NDFrame): + slicer = lambda start,edge: data._slice(slice(start,edge),axis=axis) + length = len(data.axes[axis]) else: - start = 0 - for edge, label in zip(self.bins, self.binlabels): - inds = lrange(start, edge) - yield label, data.take(inds, axis=axis) - start = edge - - n = len(data.axes[axis]) - if start < n: - inds = lrange(start, n) - yield self.binlabels[-1], data.take(inds, axis=axis) + slicer = lambda start,edge: data[slice(start,edge)] + length = len(data) + + start = 0 + for edge, label in zip(self.bins, self.binlabels): + yield label, slicer(start,edge) + start = edge + + if start < length: + yield self.binlabels[-1], slicer(start,None) def apply(self, f, data, axis=0): result_keys = [] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 4d47750660800..506eb348a8113 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2864,7 +2864,8 @@ def test_groupby_with_timegrouper(self): df = df.set_index(['Date']) expected = DataFrame({ 'Quantity' : np.nan }, - index=date_range('20130901 13:00:00','20131205 13:00:00',freq='5D',name='Date')) + index=date_range('20130901 13:00:00','20131205 13:00:00', + freq='5D',name='Date',closed='left')) expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64') result1 = df.resample('5D',how=sum) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 51144cb3bba2c..8b65882fb1279 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -185,6 +185,12 @@ def _get_time_bins(self, ax): elif not trimmed: labels = labels[:-1] + # if we end up with more labels than bins + # adjust the labels + # GH4076 + if len(bins) < len(labels): + labels = labels[:len(bins)] + return binner, bins, labels def _adjust_bin_edges(self, binner, ax_values): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 242d656b8794f..55d96ec6fbaeb 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1087,6 +1087,38 @@ def test_resample_doesnt_truncate(self): result = series.resample('D') self.assertEquals(result.index[0], dates[0]) + def test_evenly_divisible_with_no_extra_bins(self): + # 4076 + # when the frequency is evenly divisible, sometimes extra bins + + df = DataFrame(np.random.randn(9, 3), index=date_range('2000-1-1', periods=9)) + result = df.resample('5D') + expected = pd.concat([df.iloc[0:5].mean(),df.iloc[5:].mean()],axis=1).T + expected.index = [Timestamp('2000-1-1'),Timestamp('2000-1-6')] + assert_frame_equal(result,expected) + + index = date_range(start='2001-5-4', periods=28) + df = DataFrame( + [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90, + 'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 + + [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10, + 'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28, + index=index.append(index)).sort() + + index = date_range('2001-5-4',periods=4,freq='7D') + expected = DataFrame( + [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14, + 'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4, + index=index).unstack().swaplevel(1,0).sortlevel() + result = df.resample('7D', how='count') + assert_series_equal(result,expected) + + expected = DataFrame( + [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700, + 'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4, + index=index) + result = df.resample('7D', how='sum') + assert_frame_equal(result,expected) class TestTimeGrouper(tm.TestCase):