diff --git a/doc/source/release.rst b/doc/source/release.rst
index ffb792ca98da5..b71285758d53b 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -433,6 +433,7 @@ Bug Fixes
   - Fix an issue in TextFileReader w/ Python engine (i.e. PythonParser)
     with thousands != "," (:issue:`4596`)
   - Bug in getitem with a duplicate index when using where (:issue:`4879`)
+  - Fix type inference code that coerced a float column into datetime (:issue:`4601`)
 
 
 pandas 0.12.0
diff --git a/pandas/tests/test_tslib.py b/pandas/tests/test_tslib.py
new file mode 100644
index 0000000000000..b9a7356412a10
--- /dev/null
+++ b/pandas/tests/test_tslib.py
@@ -0,0 +1,124 @@
+import unittest
+
+import nose
+import numpy as np
+
+from pandas import tslib
+from datetime import datetime
+
+class TestDatetimeParsingWrappers(unittest.TestCase):
+    def test_verify_datetime_bounds(self):
+        for year in (1, 1000, 1677, 2262, 5000):
+            dt = datetime(year, 1, 1)
+            self.assertRaises(
+                ValueError,
+                tslib.verify_datetime_bounds,
+                dt
+            )
+
+        for year in (1678, 2000, 2261):
+            tslib.verify_datetime_bounds(datetime(year, 1, 1))
+
+    def test_does_not_convert_mixed_integer(self):
+        bad_date_strings = (
+            '-50000',
+            '999',
+            '123.1234',
+            'm',
+            'T'
+        )
+
+        for bad_date_string in bad_date_strings:
+            self.assertFalse(
+                tslib._does_string_look_like_datetime(bad_date_string)
+            )
+
+        good_date_strings = (
+            '2012-01-01',
+            '01/01/2012',
+            'Mon Sep 16, 2013',
+            '01012012',
+            '0101',
+            '1-1',
+        )
+
+        for good_date_string in good_date_strings:
+            self.assertTrue(
+                tslib._does_string_look_like_datetime(good_date_string)
+            )
+
+class TestArrayToDatetime(unittest.TestCase):
+    def test_parsing_valid_dates(self):
+        arr = np.array(['01-01-2013', '01-02-2013'], dtype=object)
+        self.assertTrue(
+            np.array_equal(
+                tslib.array_to_datetime(arr),
+                np.array(
+                    [
+                        '2013-01-01T00:00:00.000000000-0000',
+                        '2013-01-02T00:00:00.000000000-0000'
+                    ],
+                    dtype='M8[ns]'
+                )
+            )
+        )
+
+        arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object)
+        self.assertTrue(
+            np.array_equal(
+                tslib.array_to_datetime(arr),
+                np.array(
+                    [
+                        '2013-09-16T00:00:00.000000000-0000',
+                        '2013-09-17T00:00:00.000000000-0000'
+                    ],
+                    dtype='M8[ns]'
+                )
+            )
+        )
+
+    def test_number_looking_strings_not_into_datetime(self):
+        # #4601
+        # These strings don't look like datetimes, so no attempt
+        # should be made to convert them
+        arr = np.array(['-352.737091', '183.575577'], dtype=object)
+        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))
+
+        arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
+        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))
+
+    def test_dates_outside_of_datetime64_ns_bounds(self):
+        # These dates fall outside the range that datetime64[ns]
+        # can represent, so they cannot be converted to
+        # datetimes
+        arr = np.array(['1/1/1676', '1/2/1676'], dtype=object)
+        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))
+
+        arr = np.array(['1/1/2263', '1/2/2263'], dtype=object)
+        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))
+
+    def test_coerce_of_invalid_datetimes(self):
+        arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object)
+
+        # Without coercing, the presence of any invalid dates prevents
+        # any values from being converted
+        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))
+
+        # With coercing, the invalid dates become iNaT
+        self.assertTrue(
+            np.array_equal(
+                tslib.array_to_datetime(arr, coerce=True),
+                np.array(
+                    [
+                        '2013-01-01T00:00:00.000000000-0000',
+                        tslib.iNaT,
+                        tslib.iNaT
+                    ],
+                    dtype='M8[ns]'
+                )
+            )
+        )
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index fd97512b0528b..075102dd63100 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -317,7 +317,6 @@ class Timestamp(_Timestamp):
 
 
 _nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN'])
-_not_datelike_strings = set(['a','A','m','M','p','P','t','T'])
 
 class NaTType(_NaT):
     """(N)ot-(A)-(T)ime, the time equivalent of NaN"""
@@ -841,6 +840,65 @@ def datetime_to_datetime64(ndarray[object] values):
 
     return result, inferred_tz
 
+# dateutil parses some single-letter strings into today's date, so
+# these are never treated as date-like.
+_not_datelike_strings = set(['a','A','m','M','p','P','t','T'])
+
+def verify_datetime_bounds(dt):
+    """Verify datetime.datetime is within the datetime64[ns] bounds.
+
+    The check is made on the calendar year only: years up to and
+    including 1677, and years from 2262 onwards, are rejected.
+
+    Returns ``dt`` unchanged; raises ValueError if it is rejected.
+    """
+    if dt.year <= 1677 or dt.year >= 2262:
+        raise ValueError(
+            'Given datetime not within valid datetime64[ns] bounds'
+        )
+    return dt
+
+def _does_string_look_like_datetime(date_string):
+    """Return False for strings that should not be parsed as dates.
+
+    Strings that convert to a float below 1000, and the single
+    letters in ``_not_datelike_strings``, are rejected.  A string
+    starting with '0' is always accepted.
+    """
+    if date_string.startswith('0'):
+        # Strings starting with 0 are more consistent with a
+        # date-like string than a number
+        return True
+
+    try:
+        if float(date_string) < 1000:
+            return False
+    except ValueError:
+        # Not numeric, so the string may still be a date
+        pass
+
+    if date_string in _not_datelike_strings:
+        return False
+
+    return True
+
+def parse_datetime_string(date_string, verify_bounds=True, **kwargs):
+    """Parse ``date_string`` via ``parse_date`` after a sanity check.
+
+    Raises ValueError if the string does not look like a datetime,
+    or if ``verify_bounds`` is true and the parsed value is rejected
+    by ``verify_datetime_bounds``.  Extra keyword arguments are
+    forwarded to ``parse_date``.
+    """
+    if not _does_string_look_like_datetime(date_string):
+        raise ValueError('Given date string not likely a datetime.')
+
+    dt = parse_date(date_string, **kwargs)
+
+    if verify_bounds:
+        verify_datetime_bounds(dt)
+
+    return dt
 
 def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                       format=None, utc=None, coerce=False, unit=None):
@@ -908,24 +966,15 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                                                                    &dts)
                     _check_dts_bounds(iresult[i], &dts)
                 except ValueError:
-
-                    # for some reason, dateutil parses some single letter len-1 strings into today's date
-                    if len(val) == 1 and val in _not_datelike_strings:
-                        if coerce:
-                            iresult[i] = iNaT
-                            continue
-                        elif raise_:
-                            raise
                     try:
-                        result[i] = parse_date(val, dayfirst=dayfirst)
+                        result[i] = parse_datetime_string(
+                            val, dayfirst=dayfirst
+                        )
                     except Exception:
                         if coerce:
                             iresult[i] = iNaT
                             continue
                         raise TypeError
-                    pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns,
-                                                      &dts)
-                    _check_dts_bounds(iresult[i], &dts)
                 except:
                     if coerce:
                         iresult[i] = iNaT
@@ -946,7 +995,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                     oresult[i] = 'NaT'
                     continue
                 try:
-                    oresult[i] = parse_date(val, dayfirst=dayfirst)
+                    oresult[i] = parse_datetime_string(val, dayfirst=dayfirst)
                 except Exception:
                     if raise_:
                         raise