Skip to content

BUG: resample raises ValueError when NaT is included #7373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 14, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ Bug Fixes



- Bug in ``resample`` that raised ``ValueError`` when the target contained ``NaT`` (:issue:`7227`)



Expand Down
15 changes: 11 additions & 4 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from pandas import _np_version_under1p7
import pandas.lib as lib
from pandas.lib import Timestamp
import pandas.tslib as tslib
import pandas.algos as _algos
import pandas.hashtable as _hash

Expand Down Expand Up @@ -1581,7 +1582,11 @@ def groups(self):

# this is mainly for compat
# GH 3881
return dict(zip(self.binlabels,self.bins))
result = {}
for key, value in zip(self.binlabels, self.bins):
if key is not tslib.NaT:
result[key] = value
return result

@property
def nkeys(self):
Expand All @@ -1605,7 +1610,8 @@ def get_iterator(self, data, axis=0):

start = 0
for edge, label in zip(self.bins, self.binlabels):
yield label, slicer(start,edge)
if label is not tslib.NaT:
yield label, slicer(start,edge)
start = edge

if start < length:
Expand Down Expand Up @@ -1636,7 +1642,7 @@ def indices(self):

i = 0
for label, bin in zip(self.binlabels, self.bins):
if i < bin:
if label is not tslib.NaT and i < bin:
indices[label] = list(range(i, bin))
i = bin
return indices
Expand All @@ -1647,7 +1653,8 @@ def ngroups(self):

@cache_readonly
def result_index(self):
return self.binlabels
mask = self.binlabels.asi8 == tslib.iNaT
return self.binlabels[~mask]

@property
def levels(self):
Expand Down
11 changes: 10 additions & 1 deletion pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,10 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
int64_t l_bin, r_bin
bint right_closed = closed == 'right'

mask = values == iNaT
nat_count = values[mask].size
values = values[~mask]

lenidx = len(values)
lenbin = len(binner)

Expand All @@ -981,7 +985,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
if values[lenidx-1] > binner[lenbin-1]:
raise ValueError("Values falls after last bin")

bins = np.empty(lenbin - 1, dtype=np.int64)
bins = np.empty(lenbin - 1, dtype=np.int64)

j = 0 # index into values
bc = 0 # bin count
Expand All @@ -999,6 +1003,11 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
bins[bc] = j
bc += 1

if nat_count > 0:
# shift bins by the number of NaT
bins = bins + nat_count
bins = np.insert(bins, 0, nat_count)

return bins


Expand Down
9 changes: 7 additions & 2 deletions pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from pandas.lib import Timestamp
import pandas.lib as lib
import pandas.tslib as tslib


_DEFAULT_METHOD = 'mean'
Expand Down Expand Up @@ -186,6 +187,10 @@ def _get_time_bins(self, ax):
elif not trimmed:
labels = labels[:-1]

if (ax_values == tslib.iNaT).any():
binner = binner.insert(0, tslib.NaT)
labels = labels.insert(0, tslib.NaT)

# if we end up with more labels than bins
# adjust the labels
# GH4076
Expand Down Expand Up @@ -352,14 +357,14 @@ def _get_range_edges(axis, offset, closed='left', base=0):
if isinstance(offset, compat.string_types):
offset = to_offset(offset)

first, last = axis.min(), axis.max()
if isinstance(offset, Tick):
day_nanos = _delta_to_nanoseconds(timedelta(1))
# #1165
if (day_nanos % offset.nanos) == 0:
return _adjust_dates_anchored(axis[0], axis[-1], offset,
return _adjust_dates_anchored(first, last, offset,
closed=closed, base=base)

first, last = axis.min(), axis.max()
if not isinstance(offset, Tick): # and first.time() != last.time():
# hack!
first = tools.normalize_date(first)
Expand Down
114 changes: 98 additions & 16 deletions pandas/tseries/tests/test_resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,28 +744,32 @@ def test_resample_consistency(self):

def test_resample_timegrouper(self):
# GH 7227
dates = [datetime(2014, 10, 1), datetime(2014, 9, 3),
dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
datetime(2014, 11, 5), datetime(2014, 9, 5),
datetime(2014, 10, 8), datetime(2014, 7, 15)]

df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
result = df.set_index('A').resample('M', how='count')
exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
'2014-10-31', '2014-11-30'], freq='M', name='A')
expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
assert_frame_equal(result, expected)
dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
dates3 = [pd.NaT] + dates1 + [pd.NaT]

result = df.groupby(pd.Grouper(freq='M', key='A')).count()
assert_frame_equal(result, expected)
for dates in [dates1, dates2, dates3]:
df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
result = df.set_index('A').resample('M', how='count')
exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
'2014-10-31', '2014-11-30'], freq='M', name='A')
expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
assert_frame_equal(result, expected)

df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates))))
result = df.set_index('A').resample('M', how='count')
expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
index=exp_idx, columns=['B', 'C'])
assert_frame_equal(result, expected)
result = df.groupby(pd.Grouper(freq='M', key='A')).count()
assert_frame_equal(result, expected)

result = df.groupby(pd.Grouper(freq='M', key='A')).count()
assert_frame_equal(result, expected)
df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates))))
result = df.set_index('A').resample('M', how='count')
expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
index=exp_idx, columns=['B', 'C'])
assert_frame_equal(result, expected)

result = df.groupby(pd.Grouper(freq='M', key='A')).count()
assert_frame_equal(result, expected)


def _simple_ts(start, end, freq='D'):
Expand Down Expand Up @@ -1302,6 +1306,84 @@ def test_fails_on_no_datetime_index(self):
"but got an instance of %r" % name):
df.groupby(TimeGrouper('D'))

def test_aggregate_normal(self):
# check TimeGrouper's aggregation is identical as normal groupby

n = 20
data = np.random.randn(n, 4)
normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
normal_df['key'] = [1, 2, 3, 4, 5] * 4

dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3),
datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

normal_grouped = normal_df.groupby('key')
dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

for func in ['min', 'max', 'prod', 'var', 'std', 'mean']:
expected = getattr(normal_grouped, func)()
dt_result = getattr(dt_grouped, func)()
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
assert_frame_equal(expected, dt_result)

for func in ['count', 'sum']:
expected = getattr(normal_grouped, func)()
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
dt_result = getattr(dt_grouped, func)()
assert_frame_equal(expected, dt_result)

"""
for func in ['first', 'last']:
expected = getattr(normal_grouped, func)()
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
dt_result = getattr(dt_grouped, func)()
assert_frame_equal(expected, dt_result)

for func in ['nth']:
expected = getattr(normal_grouped, func)(3)
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
dt_result = getattr(dt_grouped, func)(3)
assert_frame_equal(expected, dt_result)
"""
# if TimeGrouper is used included, 'size' 'first','last' and 'nth' doesn't work yet

def test_aggregate_with_nat(self):
# check that aggregating through a TimeGrouper with NaT keys matches a
# normal groupby with NaN keys (GH 7227): the NaT/NaN group is dropped,
# then reintroduced as an all-missing row when relabeling to daily bins
n = 20
data = np.random.randn(n, 4)
# reference frame: key 3 replaced by NaN, which groupby drops
normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

# same data, with NaT in the position corresponding to the NaN key
dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

normal_grouped = normal_df.groupby('key')
dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

for func in ['min', 'max', 'prod']:
normal_result = getattr(normal_grouped, func)()
dt_result = getattr(dt_grouped, func)()
# pad the dropped group back in as an all-NaN row labeled 3 so it
# sorts between keys 2 and 4 before the index is relabeled
pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]],
index=[3], columns=['A', 'B', 'C', 'D'])
expected = normal_result.append(pad)
expected = expected.sort_index()
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
assert_frame_equal(expected, dt_result)

for func in ['count', 'sum']:
normal_result = getattr(normal_grouped, func)()
# count/sum of an empty group is 0, not NaN
pad = DataFrame([[0, 0, 0, 0]], index=[3], columns=['A', 'B', 'C', 'D'])
expected = normal_result.append(pad)
expected = expected.sort_index()
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
dt_result = getattr(dt_grouped, func)()
assert_frame_equal(expected, dt_result)

# NOTE: when NaT is included, 'var', 'std', 'mean', 'size', 'first',
# 'last' and 'nth' don't work yet
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that var/std/mean should raise (so add a test for these). The rest should work, though, no?



if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down