Skip to content

Commit 2bf8839

Browse files
committed
BUG: resample raises ValueError when NaT is included
1 parent 9015e43 commit 2bf8839

File tree

5 files changed

+127
-23
lines changed

5 files changed

+127
-23
lines changed

doc/source/v0.14.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ Bug Fixes
225225

226226

227227

228+
- Bug in ``resample`` raising ``ValueError`` when the target contains ``NaT`` (:issue:`7227`)
228229

229230

230231

pandas/core/groupby.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from pandas import _np_version_under1p7
2929
import pandas.lib as lib
3030
from pandas.lib import Timestamp
31+
import pandas.tslib as tslib
3132
import pandas.algos as _algos
3233
import pandas.hashtable as _hash
3334

@@ -1581,7 +1582,11 @@ def groups(self):
15811582

15821583
# this is mainly for compat
15831584
# GH 3881
1584-
return dict(zip(self.binlabels,self.bins))
1585+
result = {}
1586+
for key, value in zip(self.binlabels, self.bins):
1587+
if key is not tslib.NaT:
1588+
result[key] = value
1589+
return result
15851590

15861591
@property
15871592
def nkeys(self):
@@ -1605,7 +1610,8 @@ def get_iterator(self, data, axis=0):
16051610

16061611
start = 0
16071612
for edge, label in zip(self.bins, self.binlabels):
1608-
yield label, slicer(start,edge)
1613+
if label is not tslib.NaT:
1614+
yield label, slicer(start,edge)
16091615
start = edge
16101616

16111617
if start < length:
@@ -1636,7 +1642,7 @@ def indices(self):
16361642

16371643
i = 0
16381644
for label, bin in zip(self.binlabels, self.bins):
1639-
if i < bin:
1645+
if label is not tslib.NaT and i < bin:
16401646
indices[label] = list(range(i, bin))
16411647
i = bin
16421648
return indices
@@ -1647,7 +1653,8 @@ def ngroups(self):
16471653

16481654
@cache_readonly
16491655
def result_index(self):
1650-
return self.binlabels
1656+
mask = self.binlabels.asi8 == tslib.iNaT
1657+
return self.binlabels[~mask]
16511658

16521659
@property
16531660
def levels(self):

pandas/lib.pyx

+10-1
Original file line numberDiff line numberDiff line change
@@ -968,6 +968,10 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
968968
int64_t l_bin, r_bin
969969
bint right_closed = closed == 'right'
970970

971+
mask = values == iNaT
972+
nat_count = values[mask].size
973+
values = values[~mask]
974+
971975
lenidx = len(values)
972976
lenbin = len(binner)
973977

@@ -981,7 +985,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
981985
if values[lenidx-1] > binner[lenbin-1]:
982986
raise ValueError("Values falls after last bin")
983987

984-
bins = np.empty(lenbin - 1, dtype=np.int64)
988+
bins = np.empty(lenbin - 1, dtype=np.int64)
985989

986990
j = 0 # index into values
987991
bc = 0 # bin count
@@ -999,6 +1003,11 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
9991003
bins[bc] = j
10001004
bc += 1
10011005

1006+
if nat_count > 0:
1007+
# shift bins by the number of NaT
1008+
bins = bins + nat_count
1009+
bins = np.insert(bins, 0, nat_count)
1010+
10021011
return bins
10031012

10041013

pandas/tseries/resample.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from pandas.lib import Timestamp
1515
import pandas.lib as lib
16+
import pandas.tslib as tslib
1617

1718

1819
_DEFAULT_METHOD = 'mean'
@@ -186,6 +187,10 @@ def _get_time_bins(self, ax):
186187
elif not trimmed:
187188
labels = labels[:-1]
188189

190+
if (ax_values == tslib.iNaT).any():
191+
binner = binner.insert(0, tslib.NaT)
192+
labels = labels.insert(0, tslib.NaT)
193+
189194
# if we end up with more labels than bins
190195
# adjust the labels
191196
# GH4076
@@ -352,14 +357,14 @@ def _get_range_edges(axis, offset, closed='left', base=0):
352357
if isinstance(offset, compat.string_types):
353358
offset = to_offset(offset)
354359

360+
first, last = axis.min(), axis.max()
355361
if isinstance(offset, Tick):
356362
day_nanos = _delta_to_nanoseconds(timedelta(1))
357363
# #1165
358364
if (day_nanos % offset.nanos) == 0:
359-
return _adjust_dates_anchored(axis[0], axis[-1], offset,
365+
return _adjust_dates_anchored(first, last, offset,
360366
closed=closed, base=base)
361367

362-
first, last = axis.min(), axis.max()
363368
if not isinstance(offset, Tick): # and first.time() != last.time():
364369
# hack!
365370
first = tools.normalize_date(first)

pandas/tseries/tests/test_resample.py

+98-16
Original file line numberDiff line numberDiff line change
@@ -744,28 +744,32 @@ def test_resample_consistency(self):
744744

745745
def test_resample_timegrouper(self):
746746
# GH 7227
747-
dates = [datetime(2014, 10, 1), datetime(2014, 9, 3),
747+
dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
748748
datetime(2014, 11, 5), datetime(2014, 9, 5),
749749
datetime(2014, 10, 8), datetime(2014, 7, 15)]
750750

751-
df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
752-
result = df.set_index('A').resample('M', how='count')
753-
exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
754-
'2014-10-31', '2014-11-30'], freq='M', name='A')
755-
expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
756-
assert_frame_equal(result, expected)
751+
dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
752+
dates3 = [pd.NaT] + dates1 + [pd.NaT]
757753

758-
result = df.groupby(pd.Grouper(freq='M', key='A')).count()
759-
assert_frame_equal(result, expected)
754+
for dates in [dates1, dates2, dates3]:
755+
df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
756+
result = df.set_index('A').resample('M', how='count')
757+
exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
758+
'2014-10-31', '2014-11-30'], freq='M', name='A')
759+
expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
760+
assert_frame_equal(result, expected)
760761

761-
df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates))))
762-
result = df.set_index('A').resample('M', how='count')
763-
expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
764-
index=exp_idx, columns=['B', 'C'])
765-
assert_frame_equal(result, expected)
762+
result = df.groupby(pd.Grouper(freq='M', key='A')).count()
763+
assert_frame_equal(result, expected)
766764

767-
result = df.groupby(pd.Grouper(freq='M', key='A')).count()
768-
assert_frame_equal(result, expected)
765+
df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates))))
766+
result = df.set_index('A').resample('M', how='count')
767+
expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
768+
index=exp_idx, columns=['B', 'C'])
769+
assert_frame_equal(result, expected)
770+
771+
result = df.groupby(pd.Grouper(freq='M', key='A')).count()
772+
assert_frame_equal(result, expected)
769773

770774

771775
def _simple_ts(start, end, freq='D'):
@@ -1302,6 +1306,84 @@ def test_fails_on_no_datetime_index(self):
13021306
"but got an instance of %r" % name):
13031307
df.groupby(TimeGrouper('D'))
13041308

1309+
def test_aggregate_normal(self):
1310+
# check that TimeGrouper's aggregation is identical to a normal groupby
1311+
1312+
n = 20
1313+
data = np.random.randn(n, 4)
1314+
normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
1315+
normal_df['key'] = [1, 2, 3, 4, 5] * 4
1316+
1317+
dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
1318+
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3),
1319+
datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
1320+
1321+
normal_grouped = normal_df.groupby('key')
1322+
dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
1323+
1324+
for func in ['min', 'max', 'prod', 'var', 'std', 'mean']:
1325+
expected = getattr(normal_grouped, func)()
1326+
dt_result = getattr(dt_grouped, func)()
1327+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1328+
assert_frame_equal(expected, dt_result)
1329+
1330+
for func in ['count', 'sum']:
1331+
expected = getattr(normal_grouped, func)()
1332+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1333+
dt_result = getattr(dt_grouped, func)()
1334+
assert_frame_equal(expected, dt_result)
1335+
1336+
"""
1337+
for func in ['first', 'last']:
1338+
expected = getattr(normal_grouped, func)()
1339+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1340+
dt_result = getattr(dt_grouped, func)()
1341+
assert_frame_equal(expected, dt_result)
1342+
1343+
for func in ['nth']:
1344+
expected = getattr(normal_grouped, func)(3)
1345+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1346+
dt_result = getattr(dt_grouped, func)(3)
1347+
assert_frame_equal(expected, dt_result)
1348+
"""
1349+
# when TimeGrouper is used, 'size', 'first', 'last' and 'nth' don't work yet
1350+
1351+
def test_aggregate_with_nat(self):
1352+
# check that TimeGrouper's aggregation is identical to a normal groupby
1353+
1354+
n = 20
1355+
data = np.random.randn(n, 4)
1356+
normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
1357+
normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
1358+
1359+
dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
1360+
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
1361+
datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
1362+
1363+
normal_grouped = normal_df.groupby('key')
1364+
dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
1365+
1366+
for func in ['min', 'max', 'prod']:
1367+
normal_result = getattr(normal_grouped, func)()
1368+
dt_result = getattr(dt_grouped, func)()
1369+
pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]],
1370+
index=[3], columns=['A', 'B', 'C', 'D'])
1371+
expected = normal_result.append(pad)
1372+
expected = expected.sort_index()
1373+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1374+
assert_frame_equal(expected, dt_result)
1375+
1376+
for func in ['count', 'sum']:
1377+
normal_result = getattr(normal_grouped, func)()
1378+
pad = DataFrame([[0, 0, 0, 0]], index=[3], columns=['A', 'B', 'C', 'D'])
1379+
expected = normal_result.append(pad)
1380+
expected = expected.sort_index()
1381+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1382+
dt_result = getattr(dt_grouped, func)()
1383+
assert_frame_equal(expected, dt_result)
1384+
1385+
# if NaT is included, 'var', 'std', 'mean', 'size', 'first', 'last' and 'nth' don't work yet
1386+
13051387

13061388
if __name__ == '__main__':
13071389
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)