From 2bf88398d9f5d3c5f24b2bd6b5b71402d6a496ba Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Sun, 25 May 2014 16:19:11 +0900
Subject: [PATCH] BUG: resample raises ValueError when NaT is included

---
 doc/source/v0.14.1.txt                |   1 +
 pandas/core/groupby.py                |  15 +++-
 pandas/lib.pyx                        |  11 ++-
 pandas/tseries/resample.py            |   9 +-
 pandas/tseries/tests/test_resample.py | 114 ++++++++++++++++++++++----
 5 files changed, 127 insertions(+), 23 deletions(-)

diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index cfdef3adb1f34..1f445173d569c 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -225,6 +225,7 @@ Bug Fixes
 
 
 
+- BUG in ``resample`` raises ``ValueError`` when target contains ``NaT`` (:issue:`7227`)
 
 
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index c50df6f9bb08f..a90f00fd11e36 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -28,6 +28,7 @@ from pandas import _np_version_under1p7
 import pandas.lib as lib
 from pandas.lib import Timestamp
+import pandas.tslib as tslib
 import pandas.algos as _algos
 import pandas.hashtable as _hash
 
 
@@ -1581,7 +1582,11 @@ def groups(self):
 
         # this is mainly for compat
         # GH 3881
-        return dict(zip(self.binlabels,self.bins))
+        result = {}
+        for key, value in zip(self.binlabels, self.bins):
+            if key is not tslib.NaT:
+                result[key] = value
+        return result
 
     @property
     def nkeys(self):
@@ -1605,7 +1610,8 @@ def get_iterator(self, data, axis=0):
 
         start = 0
         for edge, label in zip(self.bins, self.binlabels):
-            yield label, slicer(start,edge)
+            if label is not tslib.NaT:
+                yield label, slicer(start,edge)
             start = edge
 
         if start < length:
@@ -1636,7 +1642,7 @@ def indices(self):
 
         i = 0
         for label, bin in zip(self.binlabels, self.bins):
-            if i < bin:
+            if label is not tslib.NaT and i < bin:
                 indices[label] = list(range(i, bin))
             i = bin
         return indices
@@ -1647,7 +1653,8 @@ def ngroups(self):
 
     @cache_readonly
     def result_index(self):
-        return self.binlabels
+        mask = self.binlabels.asi8 == tslib.iNaT
+        return self.binlabels[~mask]
 
     @property
     def levels(self):
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 3324040391340..89e681e6f1c90 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -968,6 +968,10 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
         int64_t l_bin, r_bin
         bint right_closed = closed == 'right'
 
+    mask = values == iNaT
+    nat_count = values[mask].size
+    values = values[~mask]
+
     lenidx = len(values)
     lenbin = len(binner)
 
@@ -981,7 +985,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
     if values[lenidx-1] > binner[lenbin-1]:
         raise ValueError("Values falls after last bin")
 
-    bins   = np.empty(lenbin - 1, dtype=np.int64)
+    bins = np.empty(lenbin - 1, dtype=np.int64)
 
     j  = 0 # index into values
     bc = 0 # bin count
@@ -999,6 +1003,11 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
 
         bins[bc] = j
         bc += 1
 
+    if nat_count > 0:
+        # shift bins by the number of NaT
+        bins = bins + nat_count
+        bins = np.insert(bins, 0, nat_count)
+
     return bins
 
diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
index 812dd5aba71e0..059a6bfd06719 100644
--- a/pandas/tseries/resample.py
+++ b/pandas/tseries/resample.py
@@ -13,6 +13,7 @@ from pandas.lib import Timestamp
 import pandas.lib as lib
+import pandas.tslib as tslib
 
 
 _DEFAULT_METHOD = 'mean'
 
 
@@ -186,6 +187,10 @@ def _get_time_bins(self, ax):
             elif not trimmed:
                 labels = labels[:-1]
 
+        if (ax_values == tslib.iNaT).any():
+            binner = binner.insert(0, tslib.NaT)
+            labels = labels.insert(0, tslib.NaT)
+
         # if we end up with more labels than bins
         # adjust the labels
         # GH4076
@@ -352,14 +357,14 @@ def _get_range_edges(axis, offset, closed='left', base=0):
     if isinstance(offset, compat.string_types):
         offset = to_offset(offset)
 
+    first, last = axis.min(), axis.max()
     if isinstance(offset, Tick):
         day_nanos = _delta_to_nanoseconds(timedelta(1))
         # #1165
         if (day_nanos % offset.nanos) == 0:
-            return _adjust_dates_anchored(axis[0], axis[-1], offset,
+            return _adjust_dates_anchored(first, last, offset,
                                           closed=closed, base=base)
 
-    first, last = axis.min(), axis.max()
     if not isinstance(offset, Tick):  # and first.time() != last.time():
         # hack!
         first = tools.normalize_date(first)
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index cdf62af1fd90b..db496a708adbe 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -744,28 +744,32 @@ def test_resample_consistency(self):
 
     def test_resample_timegrouper(self):
         # GH 7227
-        dates = [datetime(2014, 10, 1), datetime(2014, 9, 3),
+        dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
                  datetime(2014, 11, 5), datetime(2014, 9, 5),
                  datetime(2014, 10, 8), datetime(2014, 7, 15)]
 
-        df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
-        result = df.set_index('A').resample('M', how='count')
-        exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
-                                    '2014-10-31', '2014-11-30'], freq='M', name='A')
-        expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
-        assert_frame_equal(result, expected)
+        dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
+        dates3 = [pd.NaT] + dates1 + [pd.NaT]
 
-        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
-        assert_frame_equal(result, expected)
+        for dates in [dates1, dates2, dates3]:
+            df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
+            result = df.set_index('A').resample('M', how='count')
+            exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
+                                        '2014-10-31', '2014-11-30'], freq='M', name='A')
+            expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
+            assert_frame_equal(result, expected)
 
-        df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates))))
-        result = df.set_index('A').resample('M', how='count')
-        expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
-                             index=exp_idx, columns=['B', 'C'])
-        assert_frame_equal(result, expected)
+            result = df.groupby(pd.Grouper(freq='M', key='A')).count()
+            assert_frame_equal(result, expected)
 
-        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
-        assert_frame_equal(result, expected)
+            df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates))))
+            result = df.set_index('A').resample('M', how='count')
+            expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
+                                 index=exp_idx, columns=['B', 'C'])
+            assert_frame_equal(result, expected)
+
+            result = df.groupby(pd.Grouper(freq='M', key='A')).count()
+            assert_frame_equal(result, expected)
 
 
 def _simple_ts(start, end, freq='D'):
@@ -1302,6 +1306,84 @@ def test_fails_on_no_datetime_index(self):
                                        "but got an instance of %r" % name):
                 df.groupby(TimeGrouper('D'))
 
+    def test_aggregate_normal(self):
+        # check that TimeGrouper's aggregation is identical to a normal groupby
+
+        n = 20
+        data = np.random.randn(n, 4)
+        normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+        normal_df['key'] = [1, 2, 3, 4, 5] * 4
+
+        dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+        dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3),
+                        datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
+
+        normal_grouped = normal_df.groupby('key')
+        dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
+
+        for func in ['min', 'max', 'prod', 'var', 'std', 'mean']:
+            expected = getattr(normal_grouped, func)()
+            dt_result = getattr(dt_grouped, func)()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            assert_frame_equal(expected, dt_result)
+
+        for func in ['count', 'sum']:
+            expected = getattr(normal_grouped, func)()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            dt_result = getattr(dt_grouped, func)()
+            assert_frame_equal(expected, dt_result)
+
+        """
+        for func in ['first', 'last']:
+            expected = getattr(normal_grouped, func)()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            dt_result = getattr(dt_grouped, func)()
+            assert_frame_equal(expected, dt_result)
+
+        for func in ['nth']:
+            expected = getattr(normal_grouped, func)(3)
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            dt_result = getattr(dt_grouped, func)(3)
+            assert_frame_equal(expected, dt_result)
+        """
+        # if TimeGrouper is used, 'size', 'first', 'last' and 'nth' do not work yet
+
+    def test_aggregate_with_nat(self):
+        # check that TimeGrouper's aggregation is identical to a normal groupby
+
+        n = 20
+        data = np.random.randn(n, 4)
+        normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+        normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
+
+        dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+        dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
+                        datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
+
+        normal_grouped = normal_df.groupby('key')
+        dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
+
+        for func in ['min', 'max', 'prod']:
+            normal_result = getattr(normal_grouped, func)()
+            dt_result = getattr(dt_grouped, func)()
+            pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]],
+                            index=[3], columns=['A', 'B', 'C', 'D'])
+            expected = normal_result.append(pad)
+            expected = expected.sort_index()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            assert_frame_equal(expected, dt_result)
+
+        for func in ['count', 'sum']:
+            normal_result = getattr(normal_grouped, func)()
+            pad = DataFrame([[0, 0, 0, 0]], index=[3], columns=['A', 'B', 'C', 'D'])
+            expected = normal_result.append(pad)
+            expected = expected.sort_index()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            dt_result = getattr(dt_grouped, func)()
+            assert_frame_equal(expected, dt_result)
+
+        # if NaT is included, 'var', 'std', 'mean', 'size', 'first', 'last' and 'nth' do not work yet
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
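
Note (not part of the commit): the sketch below is a minimal illustration of the behaviour this patch enables, written against the 0.14-era API exercised by the tests above (``resample(..., how=...)`` and ``pd.Grouper``). The frame, the column names 'A'/'B' and the dates are invented for the example. Because iNaT sorts before every real timestamp, a NaT in the grouping column previously made the datetime binning raise ValueError; with this patch the NaT rows are masked out of the binning and dropped from the result index instead.

    # Illustrative sketch only -- not part of the patch.
    from datetime import datetime

    import numpy as np
    import pandas as pd

    dates = [datetime(2014, 10, 1), pd.NaT, datetime(2014, 10, 8),
             datetime(2014, 11, 5), pd.NaT]
    df = pd.DataFrame({'A': dates, 'B': np.arange(len(dates))})

    # Both code paths go through the fixed binning/grouping logic: the NaT
    # rows are excluded from the bins rather than raising ValueError, so the
    # monthly counts only cover the three real timestamps.
    print(df.set_index('A').resample('M', how='count'))
    print(df.groupby(pd.Grouper(freq='M', key='A')).count())

The fix itself lives in ``generate_bins_dt64`` (NaT values are filtered out before binning and the bin edges shifted by the NaT count) and in ``BinGrouper``, which skips the NaT bin label when building groups, indices and the result index, so downstream aggregation never sees the missing keys.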