Skip to content

Commit f738712

Browse files
committed
BUG: Constrain date parsing from strings a little bit more #4601
Currently dateutil will parse almost any string into a datetime. This change adds a filter in front of dateutil that prevents it from parsing certain strings that don't look like datetimes: (1) strings that parse to float values less than 1000, and (2) certain special one-character strings (this check already existed; this change just moves that code). Additionally, this change filters out datetimes that are out of range for the datetime64[ns] type. Currently, any out-of-range datetime simply overflows and is mapped to some arbitrary time within the bounds of datetime64[ns].
1 parent f80a666 commit f738712

File tree

3 files changed

+165
-14
lines changed

3 files changed

+165
-14
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,7 @@ Bug Fixes
433433
- Fix an issue in TextFileReader w/ Python engine (i.e. PythonParser)
434434
with thousands != "," (:issue:`4596`)
435435
- Bug in getitem with a duplicate index when using where (:issue:`4879`)
436+
- Fix type inference code coercing float columns into datetime (:issue:`4601`)
436437

437438

438439
pandas 0.12.0

pandas/tests/test_tslib.py

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import unittest
2+
3+
import numpy as np
4+
5+
from pandas import tslib
6+
from datetime import datetime
7+
8+
class TestDatetimeParsingWrappers(unittest.TestCase):
    """Tests for the bounds check and the string pre-filter exposed by tslib."""

    def test_verify_datetime_bounds(self):
        # Years for which verify_datetime_bounds must raise ValueError.
        rejected_years = (1, 1000, 1677, 2262, 5000)
        for rejected in rejected_years:
            self.assertRaises(ValueError,
                              tslib.verify_datetime_bounds,
                              datetime(rejected, 1, 1))

        # Years for which the call must complete without raising.
        accepted_years = (1678, 2000, 2261)
        for accepted in accepted_years:
            tslib.verify_datetime_bounds(datetime(accepted, 1, 1))

    def test_does_not_convert_mixed_integer(self):
        # Small or negative numbers and lone letters must be rejected
        # by the pre-filter.
        not_datelike = ('-50000', '999', '123.1234', 'm', 'T')
        for candidate in not_datelike:
            looks_like_date = tslib._does_string_look_like_datetime(candidate)
            self.assertFalse(looks_like_date)

        # Conventional date spellings, including digit-only strings with a
        # leading zero, must be let through.
        datelike = (
            '2012-01-01',
            '01/01/2012',
            'Mon Sep 16, 2013',
            '01012012',
            '0101',
            '1-1',
        )
        for candidate in datelike:
            looks_like_date = tslib._does_string_look_like_datetime(candidate)
            self.assertTrue(looks_like_date)
48+
49+
class TestArrayToDatetime(unittest.TestCase):
    """Tests for tslib.array_to_datetime on valid, numeric-looking,
    out-of-range and mixed string input.

    Uses ``assertTrue`` rather than the deprecated ``assert_`` alias, which
    matches the assertions used by the other test class in this module.
    """

    def test_parsing_valid_dates(self):
        arr = np.array(['01-01-2013', '01-02-2013'], dtype=object)
        self.assertTrue(
            np.array_equal(
                tslib.array_to_datetime(arr),
                np.array(
                    [
                        '2013-01-01T00:00:00.000000000-0000',
                        '2013-01-02T00:00:00.000000000-0000'
                    ],
                    dtype='M8[ns]'
                )
            )
        )

        arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object)
        self.assertTrue(
            np.array_equal(
                tslib.array_to_datetime(arr),
                np.array(
                    [
                        '2013-09-16T00:00:00.000000000-0000',
                        '2013-09-17T00:00:00.000000000-0000'
                    ],
                    dtype='M8[ns]'
                )
            )
        )

    def test_number_looking_strings_not_into_datetime(self):
        # #4601
        # These strings don't look like datetimes, so no conversion should
        # be attempted and the input is expected back unchanged.
        arr = np.array(['-352.737091', '183.575577'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

        arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

    def test_dates_outside_of_datetime64_ns_bounds(self):
        # These datetimes are outside of the datetime64[ns] bounds, so they
        # cannot be converted to datetimes.
        arr = np.array(['1/1/1676', '1/2/1676'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

        arr = np.array(['1/1/2263', '1/2/2263'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

    def test_coerce_of_invalid_datetimes(self):
        arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object)

        # Without coercing, the presence of any invalid date prevents
        # any values from being converted.
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

        # With coercing, the invalid dates become iNaT.
        self.assertTrue(
            np.array_equal(
                tslib.array_to_datetime(arr, coerce=True),
                np.array(
                    [
                        '2013-01-01T00:00:00.000000000-0000',
                        tslib.iNaT,
                        tslib.iNaT
                    ],
                    dtype='M8[ns]'
                )
            )
        )
120+
121+
if __name__ == '__main__':
    # nose is not imported at module level in this file, so import it here;
    # without this, running the module as a script fails with NameError.
    import nose

    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)

pandas/tslib.pyx

+41-14
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,6 @@ class Timestamp(_Timestamp):
317317

318318

319319
_nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN'])
320-
_not_datelike_strings = set(['a','A','m','M','p','P','t','T'])
321320
class NaTType(_NaT):
322321
"""(N)ot-(A)-(T)ime, the time equivalent of NaN"""
323322

@@ -841,6 +840,43 @@ def datetime_to_datetime64(ndarray[object] values):
841840

842841
return result, inferred_tz
843842

843+
_not_datelike_strings = set(['a','A','m','M','p','P','t','T'])
844+
845+
def verify_datetime_bounds(dt):
    """Verify datetime.datetime is within the datetime64[ns] bounds.

    The check looks only at ``dt.year``: any year at or below 1677, or at
    or above 2262, is rejected.

    Returns ``dt`` unchanged when it passes; raises ``ValueError`` otherwise.
    """
    year = dt.year
    too_early = year <= 1677
    too_late = year >= 2262
    if too_early or too_late:
        raise ValueError(
            'Given datetime not within valid datetime64[ns] bounds'
        )
    return dt
852+
853+
def _does_string_look_like_datetime(date_string):
854+
if date_string.startswith('0'):
855+
# Strings starting with 0 are more consistent with a
856+
# date-like string than a number
857+
return True
858+
859+
try:
860+
if float(date_string) < 1000:
861+
return False
862+
except ValueError:
863+
pass
864+
865+
if date_string in _not_datelike_strings:
866+
return False
867+
868+
return True
869+
870+
def parse_datetime_string(date_string, verify_bounds=True, **kwargs):
    """Parse ``date_string`` with ``parse_date`` after a plausibility filter.

    Raises ``ValueError`` up front when ``_does_string_look_like_datetime``
    rejects the string. Otherwise the string and ``**kwargs`` are forwarded
    to ``parse_date``. When ``verify_bounds`` is true, the parsed value is
    also passed through ``verify_datetime_bounds``, which raises
    ``ValueError`` for out-of-range years.

    Returns whatever ``parse_date`` produced.
    """
    looks_datelike = _does_string_look_like_datetime(date_string)
    if not looks_datelike:
        raise ValueError('Given date string not likely a datetime.')

    parsed = parse_date(date_string, **kwargs)
    if not verify_bounds:
        return parsed

    verify_datetime_bounds(parsed)
    return parsed
844880

845881
def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
846882
format=None, utc=None, coerce=False, unit=None):
@@ -908,24 +944,15 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
908944
&dts)
909945
_check_dts_bounds(iresult[i], &dts)
910946
except ValueError:
911-
912-
# for some reason, dateutil parses some single letter len-1 strings into today's date
913-
if len(val) == 1 and val in _not_datelike_strings:
914-
if coerce:
915-
iresult[i] = iNaT
916-
continue
917-
elif raise_:
918-
raise
919947
try:
920-
result[i] = parse_date(val, dayfirst=dayfirst)
948+
result[i] = parse_datetime_string(
949+
val, dayfirst=dayfirst
950+
)
921951
except Exception:
922952
if coerce:
923953
iresult[i] = iNaT
924954
continue
925955
raise TypeError
926-
pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns,
927-
&dts)
928-
_check_dts_bounds(iresult[i], &dts)
929956
except:
930957
if coerce:
931958
iresult[i] = iNaT
@@ -946,7 +973,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
946973
oresult[i] = 'NaT'
947974
continue
948975
try:
949-
oresult[i] = parse_date(val, dayfirst=dayfirst)
976+
oresult[i] = parse_datetime_string(val, dayfirst=dayfirst)
950977
except Exception:
951978
if raise_:
952979
raise

0 commit comments

Comments
 (0)