Skip to content

Commit 6d6f392

Browse files
committed
BUG: Fix bound checking for Timestamp() with dt64 #4065
To fix the bug, this change adds bounds checking to _get_datetime64_nanos() for numpy datetimes that aren't already in [ns] units. It also updates _check_dts_bounds() to perform the bounds check based solely on the pandas_datetimestruct, by comparing it against the minimum and maximum pandas_datetimestructs that are valid for datetime64[ns]; this is simpler and more accurate than the previous approach. Finally, it includes a number of small refactors and fixes to deal with new error cases that didn't exist when invalid datetime64s were silently coerced into the valid datetime64[ns] range.
1 parent 3722487 commit 6d6f392

File tree

9 files changed

+340
-69
lines changed

9 files changed

+340
-69
lines changed

doc/source/release.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,7 @@ Bug Fixes
555555
type of headers (:issue:`5048`).
556556
- Fixed a bug where ``DatetimeIndex`` joins with ``PeriodIndex`` caused a
557557
stack overflow (:issue:`3899`).
558-
558+
- Fixed bounds checking for ``Timestamp()`` with datetime64 input (:issue:`4065`)
559559

560560
pandas 0.12.0
561561
-------------

pandas/core/common.py

+15
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,13 @@ def _pickle_array(arr):
348348

349349
def _unpickle_array(bytes):
350350
arr = read_array(BytesIO(bytes))
351+
352+
# All datetimes should be stored as M8[ns]. When unpickling with
353+
# numpy 1.6, it will read these as M8[us]. So this ensures all
354+
# datetime64 types are read as M8[ns]
355+
if is_datetime64_dtype(arr):
356+
arr = arr.view(_NS_DTYPE)
357+
351358
return arr
352359

353360

@@ -1780,6 +1787,14 @@ def is_datetime64_dtype(arr_or_dtype):
17801787
tipo = arr_or_dtype.dtype.type
17811788
return issubclass(tipo, np.datetime64)
17821789

1790+
def is_datetime64_ns_dtype(arr_or_dtype):
1791+
if isinstance(arr_or_dtype, np.dtype):
1792+
tipo = arr_or_dtype
1793+
elif isinstance(arr_or_dtype, type):
1794+
tipo = np.dtype(arr_or_dtype)
1795+
else:
1796+
tipo = arr_or_dtype.dtype
1797+
return tipo == _NS_DTYPE
17831798

17841799
def is_timedelta64_dtype(arr_or_dtype):
17851800
if isinstance(arr_or_dtype, np.dtype):

pandas/src/datetime.pxd

+3
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ cdef extern from "datetime/np_datetime.h":
8585
npy_int64 year
8686
npy_int32 month, day, hour, min, sec, us, ps, as
8787

88+
int cmp_pandas_datetimestruct(pandas_datetimestruct *a,
89+
pandas_datetimestruct *b)
90+
8891
int convert_pydatetime_to_datetimestruct(PyObject *obj,
8992
pandas_datetimestruct *out,
9093
PANDAS_DATETIMEUNIT *out_bestunit,

pandas/src/datetime/np_datetime.c

+63
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,69 @@ set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts)
273273
}
274274
}
275275

276+
/*
277+
* Compares two pandas_datetimestruct objects chronologically. Returns 1 if a is later than b, -1 if a is earlier than b, and 0 if they are equal.
278+
*/
279+
int
280+
cmp_pandas_datetimestruct(pandas_datetimestruct *a, pandas_datetimestruct *b)
281+
{
282+
if (a->year > b->year) {
283+
return 1;
284+
} else if (a->year < b->year) {
285+
return -1;
286+
}
287+
288+
if (a->month > b->month) {
289+
return 1;
290+
} else if (a->month < b->month) {
291+
return -1;
292+
}
293+
294+
if (a->day > b->day) {
295+
return 1;
296+
} else if (a->day < b->day) {
297+
return -1;
298+
}
299+
300+
if (a->hour > b->hour) {
301+
return 1;
302+
} else if (a->hour < b->hour) {
303+
return -1;
304+
}
305+
306+
if (a->min > b->min) {
307+
return 1;
308+
} else if (a->min < b->min) {
309+
return -1;
310+
}
311+
312+
if (a->sec > b->sec) {
313+
return 1;
314+
} else if (a->sec < b->sec) {
315+
return -1;
316+
}
317+
318+
if (a->us > b->us) {
319+
return 1;
320+
} else if (a->us < b->us) {
321+
return -1;
322+
}
323+
324+
if (a->ps > b->ps) {
325+
return 1;
326+
} else if (a->ps < b->ps) {
327+
return -1;
328+
}
329+
330+
if (a->as > b->as) {
331+
return 1;
332+
} else if (a->as < b->as) {
333+
return -1;
334+
}
335+
336+
return 0;
337+
}
338+
276339
/*
277340
*
278341
* Tests for and converts a Python datetime.datetime or datetime.date

pandas/tseries/index.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def __new__(cls, data=None,
204204
data = _str_to_dt_array(data, offset, dayfirst=dayfirst,
205205
yearfirst=yearfirst)
206206
else:
207-
data = tools.to_datetime(data)
207+
data = tools.to_datetime(data, errors='raise')
208208
data.offset = offset
209209
if isinstance(data, DatetimeIndex):
210210
if name is not None:
@@ -243,14 +243,14 @@ def __new__(cls, data=None,
243243
subarr = data.view(_NS_DTYPE)
244244
else:
245245
try:
246-
subarr = tools.to_datetime(data)
246+
subarr = tools.to_datetime(data, box=False)
247247
except ValueError:
248248
# tz aware
249-
subarr = tools.to_datetime(data, utc=True)
249+
subarr = tools.to_datetime(data, box=False, utc=True)
250250

251251
if not np.issubdtype(subarr.dtype, np.datetime64):
252-
raise TypeError('Unable to convert %s to datetime dtype'
253-
% str(data))
252+
raise ValueError('Unable to convert %s to datetime dtype'
253+
% str(data))
254254

255255
if isinstance(subarr, DatetimeIndex):
256256
if tz is None:
@@ -934,7 +934,7 @@ def join(self, other, how='left', level=None, return_indexers=False):
934934
'mixed-integer-float', 'mixed')):
935935
try:
936936
other = DatetimeIndex(other)
937-
except TypeError:
937+
except (TypeError, ValueError):
938938
pass
939939

940940
this, other = self._maybe_utc_convert(other)
@@ -1051,7 +1051,7 @@ def intersection(self, other):
10511051
if not isinstance(other, DatetimeIndex):
10521052
try:
10531053
other = DatetimeIndex(other)
1054-
except TypeError:
1054+
except (TypeError, ValueError):
10551055
pass
10561056
result = Index.intersection(self, other)
10571057
if isinstance(result, DatetimeIndex):

pandas/tseries/tests/test_timeseries.py

+76-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# pylint: disable-msg=E1101,W0612
2-
from datetime import datetime, time, timedelta
2+
from datetime import datetime, time, timedelta, date
33
import sys
44
import os
55
import unittest
@@ -952,6 +952,81 @@ def test_to_datetime_list_of_integers(self):
952952

953953
self.assert_(rng.equals(result))
954954

955+
def test_to_datetime_dt64s(self):
956+
in_bound_dts = [
957+
np.datetime64('2000-01-01'),
958+
np.datetime64('2000-01-02'),
959+
]
960+
961+
for dt in in_bound_dts:
962+
self.assertEqual(
963+
pd.to_datetime(dt),
964+
Timestamp(dt)
965+
)
966+
967+
oob_dts = [
968+
np.datetime64('1000-01-01'),
969+
np.datetime64('5000-01-02'),
970+
]
971+
972+
for dt in oob_dts:
973+
self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise')
974+
self.assertRaises(ValueError, tslib.Timestamp, dt)
975+
self.assert_(pd.to_datetime(dt, coerce=True) is NaT)
976+
977+
def test_to_datetime_array_of_dt64s(self):
978+
dts = [
979+
np.datetime64('2000-01-01'),
980+
np.datetime64('2000-01-02'),
981+
]
982+
983+
# Assuming all datetimes are in bounds, to_datetime() returns
984+
# an array that is equal to Timestamp() parsing
985+
self.assert_(
986+
np.array_equal(
987+
pd.to_datetime(dts, box=False),
988+
np.array([Timestamp(x).asm8 for x in dts])
989+
)
990+
)
991+
992+
# A list of datetimes where the last one is out of bounds
993+
dts_with_oob = dts + [np.datetime64('9999-01-01')]
994+
995+
self.assertRaises(
996+
ValueError,
997+
pd.to_datetime,
998+
dts_with_oob,
999+
coerce=False,
1000+
errors='raise'
1001+
)
1002+
1003+
self.assert_(
1004+
np.array_equal(
1005+
pd.to_datetime(dts_with_oob, box=False, coerce=True),
1006+
np.array(
1007+
[
1008+
Timestamp(dts_with_oob[0]).asm8,
1009+
Timestamp(dts_with_oob[1]).asm8,
1010+
iNaT,
1011+
],
1012+
dtype='M8'
1013+
)
1014+
)
1015+
)
1016+
1017+
# With coerce=False and errors='ignore', out of bounds datetime64s
1018+
# are converted to their .item(), which depending on the version of
1019+
# numpy is either a python datetime.datetime or datetime.date
1020+
self.assert_(
1021+
np.array_equal(
1022+
pd.to_datetime(dts_with_oob, box=False, coerce=False),
1023+
np.array(
1024+
[dt.item() for dt in dts_with_oob],
1025+
dtype='O'
1026+
)
1027+
)
1028+
)
1029+
9551030
def test_index_to_datetime(self):
9561031
idx = Index(['1/1/2000', '1/2/2000', '1/3/2000'])
9571032

pandas/tseries/tests/test_tslib.py

+86-22
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55

66
from pandas import tslib
7-
from datetime import datetime
7+
import datetime
88

99
from pandas.core.api import Timestamp
1010

@@ -15,19 +15,53 @@
1515
from pandas import _np_version_under1p7
1616

1717

18-
class TestDatetimeParsingWrappers(unittest.TestCase):
19-
def test_verify_datetime_bounds(self):
20-
for year in (1, 1000, 1677, 2262, 5000):
21-
dt = datetime(year, 1, 1)
22-
self.assertRaises(
23-
ValueError,
24-
tslib.verify_datetime_bounds,
25-
dt
26-
)
18+
class TestTimestamp(unittest.TestCase):
19+
def test_bounds_with_different_units(self):
20+
out_of_bounds_dates = (
21+
'1677-09-21',
22+
'2262-04-12',
23+
)
24+
25+
time_units = ('D', 'h', 'm', 's', 'ms', 'us')
2726

28-
for year in (1678, 2000, 2261):
29-
tslib.verify_datetime_bounds(datetime(year, 1, 1))
27+
for date_string in out_of_bounds_dates:
28+
for unit in time_units:
29+
self.assertRaises(
30+
ValueError,
31+
tslib.Timestamp,
32+
np.datetime64(date_string, dtype='M8[%s]' % unit)
33+
)
34+
35+
in_bounds_dates = (
36+
'1677-09-23',
37+
'2262-04-11',
38+
)
3039

40+
for date_string in in_bounds_dates:
41+
for unit in time_units:
42+
tslib.Timestamp(
43+
np.datetime64(date_string, dtype='M8[%s]' % unit)
44+
)
45+
46+
def test_barely_oob_dts(self):
47+
one_us = np.timedelta64(1)
48+
49+
# By definition we can't go out of bounds in [ns], so we
50+
# convert the datetime64s to [us] so we can go out of bounds
51+
min_ts_us = np.datetime64(tslib.Timestamp.min).astype('M8[us]')
52+
max_ts_us = np.datetime64(tslib.Timestamp.max).astype('M8[us]')
53+
54+
# No error for the min/max datetimes
55+
tslib.Timestamp(min_ts_us)
56+
tslib.Timestamp(max_ts_us)
57+
58+
# One us less than the minimum is an error
59+
self.assertRaises(ValueError, tslib.Timestamp, min_ts_us - one_us)
60+
61+
# One us more than the maximum is an error
62+
self.assertRaises(ValueError, tslib.Timestamp, max_ts_us + one_us)
63+
64+
class TestDatetimeParsingWrappers(unittest.TestCase):
3165
def test_does_not_convert_mixed_integer(self):
3266
bad_date_strings = (
3367
'-50000',
@@ -97,15 +131,45 @@ def test_number_looking_strings_not_into_datetime(self):
97131
arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
98132
self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr))
99133

100-
def test_dates_outside_of_datetime64_ns_bounds(self):
101-
# These datetimes are outside of the bounds of the
102-
# datetime64[ns] bounds, so they cannot be converted to
103-
# datetimes
104-
arr = np.array(['1/1/1676', '1/2/1676'], dtype=object)
105-
self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr))
134+
def test_coercing_dates_outside_of_datetime64_ns_bounds(self):
135+
invalid_dates = [
136+
datetime.date(1000, 1, 1),
137+
datetime.datetime(1000, 1, 1),
138+
'1000-01-01',
139+
'Jan 1, 1000',
140+
np.datetime64('1000-01-01'),
141+
]
106142

107-
arr = np.array(['1/1/2263', '1/2/2263'], dtype=object)
108-
self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr))
143+
for invalid_date in invalid_dates:
144+
self.assertRaises(
145+
ValueError,
146+
tslib.array_to_datetime,
147+
np.array([invalid_date], dtype='object'),
148+
coerce=False,
149+
raise_=True,
150+
)
151+
self.assert_(
152+
np.array_equal(
153+
tslib.array_to_datetime(
154+
np.array([invalid_date], dtype='object'), coerce=True
155+
),
156+
np.array([tslib.iNaT], dtype='M8[ns]')
157+
)
158+
)
159+
160+
arr = np.array(['1/1/1000', '1/1/2000'], dtype=object)
161+
self.assert_(
162+
np.array_equal(
163+
tslib.array_to_datetime(arr, coerce=True),
164+
np.array(
165+
[
166+
tslib.iNaT,
167+
'2000-01-01T00:00:00.000000000-0000'
168+
],
169+
dtype='M8[ns]'
170+
)
171+
)
172+
)
109173

110174
def test_coerce_of_invalid_datetimes(self):
111175
arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object)
@@ -130,11 +194,11 @@ def test_coerce_of_invalid_datetimes(self):
130194
)
131195

132196

133-
class TestTimestamp(unittest.TestCase):
197+
class TestTimestampNsOperations(unittest.TestCase):
134198
def setUp(self):
135199
if _np_version_under1p7:
136200
raise nose.SkipTest('numpy >= 1.7 required')
137-
self.timestamp = Timestamp(datetime.utcnow())
201+
self.timestamp = Timestamp(datetime.datetime.utcnow())
138202

139203
def assert_ns_timedelta(self, modified_timestamp, expected_value):
140204
value = self.timestamp.value

0 commit comments

Comments
 (0)