diff --git a/doc/source/release.rst b/doc/source/release.rst index ebba7444e82d8..3b7bd6544e569 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -555,7 +555,7 @@ Bug Fixes type of headers (:issue:`5048`). - Fixed a bug where ``DatetimeIndex`` joins with ``PeriodIndex`` caused a stack overflow (:issue:`3899`). - + - Fix bound checking for Timestamp() with datetime64 input (:issue:`4065`) pandas 0.12.0 ------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index 2c5ca42c7be86..108b82eaf9056 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -348,6 +348,13 @@ def _pickle_array(arr): def _unpickle_array(bytes): arr = read_array(BytesIO(bytes)) + + # All datetimes should be stored as M8[ns]. When unpickling with + # numpy 1.6, it will read these as M8[us]. So this ensures all + # datetime64 types are read as M8[ns] + if is_datetime64_dtype(arr): + arr = arr.view(_NS_DTYPE) + return arr @@ -1780,6 +1787,14 @@ def is_datetime64_dtype(arr_or_dtype): tipo = arr_or_dtype.dtype.type return issubclass(tipo, np.datetime64) +def is_datetime64_ns_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + tipo = arr_or_dtype + elif isinstance(arr_or_dtype, type): + tipo = np.dtype(arr_or_dtype) + else: + tipo = arr_or_dtype.dtype + return tipo == _NS_DTYPE def is_timedelta64_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): diff --git a/pandas/src/datetime.pxd b/pandas/src/datetime.pxd index 1a977aab48514..3e11c9d20fb0d 100644 --- a/pandas/src/datetime.pxd +++ b/pandas/src/datetime.pxd @@ -85,6 +85,9 @@ cdef extern from "datetime/np_datetime.h": npy_int64 year npy_int32 month, day, hour, min, sec, us, ps, as + int cmp_pandas_datetimestruct(pandas_datetimestruct *a, + pandas_datetimestruct *b) + int convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, PANDAS_DATETIMEUNIT *out_bestunit, diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c index 
527ce615917cf..c30b404d2b8b2 100644 --- a/pandas/src/datetime/np_datetime.c +++ b/pandas/src/datetime/np_datetime.c @@ -273,6 +273,69 @@ set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts) } } +/* + * Compares two pandas_datetimestruct objects chronologically + */ +int +cmp_pandas_datetimestruct(pandas_datetimestruct *a, pandas_datetimestruct *b) +{ + if (a->year > b->year) { + return 1; + } else if (a->year < b->year) { + return -1; + } + + if (a->month > b->month) { + return 1; + } else if (a->month < b->month) { + return -1; + } + + if (a->day > b->day) { + return 1; + } else if (a->day < b->day) { + return -1; + } + + if (a->hour > b->hour) { + return 1; + } else if (a->hour < b->hour) { + return -1; + } + + if (a->min > b->min) { + return 1; + } else if (a->min < b->min) { + return -1; + } + + if (a->sec > b->sec) { + return 1; + } else if (a->sec < b->sec) { + return -1; + } + + if (a->us > b->us) { + return 1; + } else if (a->us < b->us) { + return -1; + } + + if (a->ps > b->ps) { + return 1; + } else if (a->ps < b->ps) { + return -1; + } + + if (a->as > b->as) { + return 1; + } else if (a->as < b->as) { + return -1; + } + + return 0; +} + /* * * Tests for and converts a Python datetime.datetime or datetime.date diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 281ac0cc8a35a..7d67f3b013b37 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -204,7 +204,7 @@ def __new__(cls, data=None, data = _str_to_dt_array(data, offset, dayfirst=dayfirst, yearfirst=yearfirst) else: - data = tools.to_datetime(data) + data = tools.to_datetime(data, errors='raise') data.offset = offset if isinstance(data, DatetimeIndex): if name is not None: @@ -243,14 +243,14 @@ def __new__(cls, data=None, subarr = data.view(_NS_DTYPE) else: try: - subarr = tools.to_datetime(data) + subarr = tools.to_datetime(data, box=False) except ValueError: # tz aware - subarr = tools.to_datetime(data, utc=True) + subarr = tools.to_datetime(data, 
box=False, utc=True) if not np.issubdtype(subarr.dtype, np.datetime64): - raise TypeError('Unable to convert %s to datetime dtype' - % str(data)) + raise ValueError('Unable to convert %s to datetime dtype' + % str(data)) if isinstance(subarr, DatetimeIndex): if tz is None: @@ -934,7 +934,7 @@ def join(self, other, how='left', level=None, return_indexers=False): 'mixed-integer-float', 'mixed')): try: other = DatetimeIndex(other) - except TypeError: + except (TypeError, ValueError): pass this, other = self._maybe_utc_convert(other) @@ -1051,7 +1051,7 @@ def intersection(self, other): if not isinstance(other, DatetimeIndex): try: other = DatetimeIndex(other) - except TypeError: + except (TypeError, ValueError): pass result = Index.intersection(self, other) if isinstance(result, DatetimeIndex): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 5329f37095961..cda84a99a95db 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1,5 +1,5 @@ # pylint: disable-msg=E1101,W0612 -from datetime import datetime, time, timedelta +from datetime import datetime, time, timedelta, date import sys import os import unittest @@ -952,6 +952,81 @@ def test_to_datetime_list_of_integers(self): self.assert_(rng.equals(result)) + def test_to_datetime_dt64s(self): + in_bound_dts = [ + np.datetime64('2000-01-01'), + np.datetime64('2000-01-02'), + ] + + for dt in in_bound_dts: + self.assertEqual( + pd.to_datetime(dt), + Timestamp(dt) + ) + + oob_dts = [ + np.datetime64('1000-01-01'), + np.datetime64('5000-01-02'), + ] + + for dt in oob_dts: + self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise') + self.assertRaises(ValueError, tslib.Timestamp, dt) + self.assert_(pd.to_datetime(dt, coerce=True) is NaT) + + def test_to_datetime_array_of_dt64s(self): + dts = [ + np.datetime64('2000-01-01'), + np.datetime64('2000-01-02'), + ] + + # Assuming all datetimes are in bounds, to_datetime() returns 
+ # an array that is equal to Timestamp() parsing + self.assert_( + np.array_equal( + pd.to_datetime(dts, box=False), + np.array([Timestamp(x).asm8 for x in dts]) + ) + ) + + # A list of datetimes where the last one is out of bounds + dts_with_oob = dts + [np.datetime64('9999-01-01')] + + self.assertRaises( + ValueError, + pd.to_datetime, + dts_with_oob, + coerce=False, + errors='raise' + ) + + self.assert_( + np.array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=True), + np.array( + [ + Timestamp(dts_with_oob[0]).asm8, + Timestamp(dts_with_oob[1]).asm8, + iNaT, + ], + dtype='M8' + ) + ) + ) + + # With coerce=False and errors='ignore', out of bounds datetime64s + # are converted to their .item(), which depending on the version of + # numpy is either a python datetime.datetime or datetime.date + self.assert_( + np.array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=False), + np.array( + [dt.item() for dt in dts_with_oob], + dtype='O' + ) + ) + ) + def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 4e7daede03085..20138cb8b1eb8 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -4,7 +4,7 @@ import numpy as np from pandas import tslib -from datetime import datetime +import datetime from pandas.core.api import Timestamp @@ -15,19 +15,53 @@ from pandas import _np_version_under1p7 -class TestDatetimeParsingWrappers(unittest.TestCase): - def test_verify_datetime_bounds(self): - for year in (1, 1000, 1677, 2262, 5000): - dt = datetime(year, 1, 1) - self.assertRaises( - ValueError, - tslib.verify_datetime_bounds, - dt - ) +class TestTimestamp(unittest.TestCase): + def test_bounds_with_different_units(self): + out_of_bounds_dates = ( + '1677-09-21', + '2262-04-12', + ) + + time_units = ('D', 'h', 'm', 's', 'ms', 'us') - for year in (1678, 2000, 2261): - 
tslib.verify_datetime_bounds(datetime(year, 1, 1)) + for date_string in out_of_bounds_dates: + for unit in time_units: + self.assertRaises( + ValueError, + tslib.Timestamp, + np.datetime64(date_string, dtype='M8[%s]' % unit) + ) + + in_bounds_dates = ( + '1677-09-23', + '2262-04-11', + ) + for date_string in in_bounds_dates: + for unit in time_units: + tslib.Timestamp( + np.datetime64(date_string, dtype='M8[%s]' % unit) + ) + + def test_barely_oob_dts(self): + one_us = np.timedelta64(1) + + # By definition we can't go out of bounds in [ns], so we + # convert the datetime64s to [us] so we can go out of bounds + min_ts_us = np.datetime64(tslib.Timestamp.min).astype('M8[us]') + max_ts_us = np.datetime64(tslib.Timestamp.max).astype('M8[us]') + + # No error for the min/max datetimes + tslib.Timestamp(min_ts_us) + tslib.Timestamp(max_ts_us) + + # One us less than the minimum is an error + self.assertRaises(ValueError, tslib.Timestamp, min_ts_us - one_us) + + # One us more than the maximum is an error + self.assertRaises(ValueError, tslib.Timestamp, max_ts_us + one_us) + +class TestDatetimeParsingWrappers(unittest.TestCase): def test_does_not_convert_mixed_integer(self): bad_date_strings = ( '-50000', @@ -97,15 +131,45 @@ def test_number_looking_strings_not_into_datetime(self): arr = np.array(['1', '2', '3', '4', '5'], dtype=object) self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) - def test_dates_outside_of_datetime64_ns_bounds(self): - # These datetimes are outside of the bounds of the - # datetime64[ns] bounds, so they cannot be converted to - # datetimes - arr = np.array(['1/1/1676', '1/2/1676'], dtype=object) - self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + def test_coercing_dates_outside_of_datetime64_ns_bounds(self): + invalid_dates = [ + datetime.date(1000, 1, 1), + datetime.datetime(1000, 1, 1), + '1000-01-01', + 'Jan 1, 1000', + np.datetime64('1000-01-01'), + ] - arr = np.array(['1/1/2263', '1/2/2263'], dtype=object) - 
self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + for invalid_date in invalid_dates: + self.assertRaises( + ValueError, + tslib.array_to_datetime, + np.array([invalid_date], dtype='object'), + coerce=False, + raise_=True, + ) + self.assert_( + np.array_equal( + tslib.array_to_datetime( + np.array([invalid_date], dtype='object'), coerce=True + ), + np.array([tslib.iNaT], dtype='M8[ns]') + ) + ) + + arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) + self.assert_( + np.array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( + [ + tslib.iNaT, + '2000-01-01T00:00:00.000000000-0000' + ], + dtype='M8[ns]' + ) + ) + ) def test_coerce_of_invalid_datetimes(self): arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) @@ -130,11 +194,11 @@ def test_coerce_of_invalid_datetimes(self): ) -class TestTimestamp(unittest.TestCase): +class TestTimestampNsOperations(unittest.TestCase): def setUp(self): if _np_version_under1p7: raise nose.SkipTest('numpy >= 1.7 required') - self.timestamp = Timestamp(datetime.utcnow()) + self.timestamp = Timestamp(datetime.datetime.utcnow()) def assert_ns_timedelta(self, modified_timestamp, expected_value): value = self.timestamp.value diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 39364d21d4aa1..793d9409e662e 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -89,13 +89,12 @@ def _convert_listlike(arg, box): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') - if com.is_datetime64_dtype(arg): + if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) - except ValueError as e: - values, tz = tslib.datetime_to_datetime64(arg) - return DatetimeIndex._simple_new(values, None, tz=tz) + except ValueError: + pass return arg diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 5f81389f318f8..ff3284b72aecb 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -404,6 +404,11 
@@ cpdef object get_value_box(ndarray arr, object loc): # wraparound behavior when using the true int64 lower boundary cdef int64_t _NS_LOWER_BOUND = -9223285636854775000LL cdef int64_t _NS_UPPER_BOUND = 9223372036854775807LL + +cdef pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS +pandas_datetime_to_datetimestruct(_NS_LOWER_BOUND, PANDAS_FR_ns, &_NS_MIN_DTS) +pandas_datetime_to_datetimestruct(_NS_UPPER_BOUND, PANDAS_FR_ns, &_NS_MAX_DTS) + Timestamp.min = Timestamp(_NS_LOWER_BOUND) Timestamp.max = Timestamp(_NS_UPPER_BOUND) @@ -759,7 +764,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit): if is_timestamp(ts): obj.value += ts.nanosecond - _check_dts_bounds(obj.value, &obj.dts) + _check_dts_bounds(&obj.dts) return obj elif PyDate_Check(ts): # Keep the converter same as PyDateTime's @@ -770,7 +775,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit): type(ts)) if obj.value != NPY_NAT: - _check_dts_bounds(obj.value, &obj.dts) + _check_dts_bounds(&obj.dts) if tz is not None: _localize_tso(obj, tz) @@ -825,16 +830,26 @@ cdef inline object _get_zone(object tz): return tz -cdef inline _check_dts_bounds(int64_t value, pandas_datetimestruct *dts): - cdef pandas_datetimestruct dts2 - if dts.year <= 1677 or dts.year >= 2262: - pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts2) - if dts2.year != dts.year: - fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, - dts.day, dts.hour, - dts.min, dts.sec) +class OutOfBoundsDatetime(ValueError): + pass + +cdef inline _check_dts_bounds(pandas_datetimestruct *dts): + cdef: + bint error = False + + if dts.year <= 1677 and cmp_pandas_datetimestruct(dts, &_NS_MIN_DTS) == -1: + error = True + elif ( + dts.year >= 2262 and + cmp_pandas_datetimestruct(dts, &_NS_MAX_DTS) == 1): + error = True - raise ValueError('Out of bounds nanosecond timestamp: %s' % fmt) + if error: + fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec) + + raise 
OutOfBoundsDatetime('Out of bounds nanosecond timestamp: %s' % fmt) # elif isinstance(ts, _Timestamp): # tmp = ts @@ -869,12 +884,12 @@ def datetime_to_datetime64(ndarray[object] values): _ts = convert_to_tsobject(val, None, None) iresult[i] = _ts.value - _check_dts_bounds(iresult[i], &_ts.dts) + _check_dts_bounds(&_ts.dts) else: if inferred_tz is not None: raise ValueError('Cannot mix tz-aware with tz-naive values') iresult[i] = _pydatetime_to_dts(val, &dts) - _check_dts_bounds(iresult[i], &dts) + _check_dts_bounds(&dts) else: raise TypeError('Unrecognized value type: %s' % type(val)) @@ -882,14 +897,6 @@ def datetime_to_datetime64(ndarray[object] values): _not_datelike_strings = set(['a','A','m','M','p','P','t','T']) -def verify_datetime_bounds(dt): - """Verify datetime.datetime is within the datetime64[ns] bounds.""" - if dt.year <= 1677 or dt.year >= 2262: - raise ValueError( - 'Given datetime not within valid datetime64[ns] bounds' - ) - return dt - def _does_string_look_like_datetime(date_string): if date_string.startswith('0'): # Strings starting with 0 are more consistent with a @@ -907,15 +914,11 @@ def _does_string_look_like_datetime(date_string): return True -def parse_datetime_string(date_string, verify_bounds=True, **kwargs): +def parse_datetime_string(date_string, **kwargs): if not _does_string_look_like_datetime(date_string): raise ValueError('Given date string not likely a datetime.') dt = parse_date(date_string, **kwargs) - - if verify_bounds: - verify_datetime_bounds(dt) - return dt def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, @@ -942,7 +945,13 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, if utc_convert: _ts = convert_to_tsobject(val, None, unit) iresult[i] = _ts.value - _check_dts_bounds(iresult[i], &_ts.dts) + try: + _check_dts_bounds(&_ts.dts) + except ValueError: + if coerce: + iresult[i] = iNaT + continue + raise else: raise ValueError('Tz-aware datetime.datetime cannot ' 'be 
converted to datetime64 unless ' @@ -951,12 +960,30 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, iresult[i] = _pydatetime_to_dts(val, &dts) if is_timestamp(val): iresult[i] += (<_Timestamp>val).nanosecond - _check_dts_bounds(iresult[i], &dts) + try: + _check_dts_bounds(&dts) + except ValueError: + if coerce: + iresult[i] = iNaT + continue + raise elif PyDate_Check(val): iresult[i] = _date_to_datetime64(val, &dts) - _check_dts_bounds(iresult[i], &dts) + try: + _check_dts_bounds(&dts) + except ValueError: + if coerce: + iresult[i] = iNaT + continue + raise elif util.is_datetime64_object(val): - iresult[i] = _get_datetime64_nanos(val) + try: + iresult[i] = _get_datetime64_nanos(val) + except ValueError: + if coerce: + iresult[i] = iNaT + continue + raise # if we are coercing, dont' allow integers elif util.is_integer_object(val) and not coerce: @@ -982,17 +1009,26 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, _string_to_dts(val, &dts) iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) - _check_dts_bounds(iresult[i], &dts) + _check_dts_bounds(&dts) except ValueError: try: - result[i] = parse_datetime_string( - val, dayfirst=dayfirst + iresult[i] = _pydatetime_to_dts( + parse_datetime_string(val, dayfirst=dayfirst), + &dts ) except Exception: if coerce: iresult[i] = iNaT continue raise TypeError + + try: + _check_dts_bounds(&dts) + except ValueError: + if coerce: + iresult[i] = iNaT + continue + raise except: if coerce: iresult[i] = iNaT @@ -1000,6 +1036,18 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, raise return result + except OutOfBoundsDatetime: + if raise_: + raise + + oresult = np.empty(n, dtype=object) + for i in range(n): + val = values[i] + if util.is_datetime64_object(val): + oresult[i] = val.item() + else: + oresult[i] = val + return oresult except TypeError: oresult = np.empty(n, dtype=object) @@ -1014,6 +1062,8 @@ def 
array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, continue try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst) + _pydatetime_to_dts(oresult[i], &dts) + _check_dts_bounds(&dts) except Exception: if raise_: raise @@ -1320,7 +1370,7 @@ def array_strptime(ndarray[object] values, object fmt): dts.us = fraction iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) - _check_dts_bounds(iresult[i], &dts) + _check_dts_bounds(&dts) return result @@ -1339,6 +1389,7 @@ cdef inline _get_datetime64_nanos(object val): if unit != PANDAS_FR_ns: pandas_datetime_to_datetimestruct(ival, unit, &dts) + _check_dts_bounds(&dts) return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) else: return ival @@ -1398,6 +1449,7 @@ def cast_to_nanoseconds(ndarray arr): for i in range(n): pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + _check_dts_bounds(&dts) return result