Skip to content

BUG: Constrain date parsing from strings a little bit more #4601 #4863

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 20, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,7 @@ Bug Fixes
- Fix an issue in TextFileReader w/ Python engine (i.e. PythonParser)
with thousands != "," (:issue:`4596`)
- Bug in getitem with a duplicate index when using where (:issue:`4879`)
- Fix bug where type inference code coerces a float column into datetime (:issue:`4601`)


pandas 0.12.0
Expand Down
123 changes: 123 additions & 0 deletions pandas/tests/test_tslib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import unittest

import numpy as np

from pandas import tslib
from datetime import datetime

class TestDatetimeParsingWrappers(unittest.TestCase):
    """Checks for the helpers that guard string-to-datetime parsing."""

    def test_verify_datetime_bounds(self):
        # January 1st of these years must be rejected with ValueError ...
        rejected_years = (1, 1000, 1677, 2262, 5000)
        # ... while January 1st of these years must be accepted silently.
        accepted_years = (1678, 2000, 2261)

        for yr in rejected_years:
            stamp = datetime(yr, 1, 1)
            self.assertRaises(ValueError, tslib.verify_datetime_bounds, stamp)

        for yr in accepted_years:
            # Passing means "does not raise"; the return value is not checked.
            tslib.verify_datetime_bounds(datetime(yr, 1, 1))

    def test_does_not_convert_mixed_integer(self):
        looks_like_datetime = tslib._does_string_look_like_datetime

        # Plain numbers and lone letters must not be treated as dates.
        not_dates = ('-50000', '999', '123.1234', 'm', 'T')
        for candidate in not_dates:
            self.assertFalse(looks_like_datetime(candidate))

        # Conventional date spellings, including zero-padded digit runs,
        # must be treated as dates.
        dates = (
            '2012-01-01',
            '01/01/2012',
            'Mon Sep 16, 2013',
            '01012012',
            '0101',
            '1-1',
        )
        for candidate in dates:
            self.assertTrue(looks_like_datetime(candidate))

class TestArrayToDatetime(unittest.TestCase):
    """Exercise ``tslib.array_to_datetime`` on object arrays of strings."""

    def test_parsing_valid_dates(self):
        # Date-like strings are expected to come back as an M8[ns] array.
        arr = np.array(['01-01-2013', '01-02-2013'], dtype=object)
        self.assertTrue(
            np.array_equal(
                tslib.array_to_datetime(arr),
                np.array(
                    [
                        '2013-01-01T00:00:00.000000000-0000',
                        '2013-01-02T00:00:00.000000000-0000'
                    ],
                    dtype='M8[ns]'
                )
            )
        )

        arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object)
        self.assertTrue(
            np.array_equal(
                tslib.array_to_datetime(arr),
                np.array(
                    [
                        '2013-09-16T00:00:00.000000000-0000',
                        '2013-09-17T00:00:00.000000000-0000'
                    ],
                    dtype='M8[ns]'
                )
            )
        )

    def test_number_looking_strings_not_into_datetime(self):
        # #4601
        # These strings don't look like datetimes, so no conversion should
        # be attempted and the input must come back unchanged.
        arr = np.array(['-352.737091', '183.575577'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

        arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

    def test_dates_outside_of_datetime64_ns_bounds(self):
        # These dates fall outside the range accepted for datetime64[ns],
        # so they cannot be converted and must come back unchanged.
        arr = np.array(['1/1/1676', '1/2/1676'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

        arr = np.array(['1/1/2263', '1/2/2263'], dtype=object)
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

    def test_coerce_of_invalid_datetimes(self):
        arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object)

        # Without coercing, the presence of any invalid date prevents
        # every value from being converted.
        self.assertTrue(np.array_equal(tslib.array_to_datetime(arr), arr))

        # With coercing, the invalid dates become iNaT.
        self.assertTrue(
            np.array_equal(
                tslib.array_to_datetime(arr, coerce=True),
                np.array(
                    [
                        '2013-01-01T00:00:00.000000000-0000',
                        tslib.iNaT,
                        tslib.iNaT
                    ],
                    dtype='M8[ns]'
                )
            )
        )

if __name__ == '__main__':
    # nose is only needed when this file is executed directly, and the
    # module-level imports do not bring it in; import it here so the call
    # below does not fail with NameError.
    import nose

    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)
55 changes: 41 additions & 14 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,6 @@ class Timestamp(_Timestamp):


_nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN'])
_not_datelike_strings = set(['a','A','m','M','p','P','t','T'])
class NaTType(_NaT):
"""(N)ot-(A)-(T)ime, the time equivalent of NaN"""

Expand Down Expand Up @@ -841,6 +840,43 @@ def datetime_to_datetime64(ndarray[object] values):

return result, inferred_tz

# Single-letter strings that must never be treated as dates. dateutil has
# been observed to parse some one-character strings into today's date, so
# _does_string_look_like_datetime rejects any string found in this set.
_not_datelike_strings = set(['a','A','m','M','p','P','t','T'])

def verify_datetime_bounds(dt):
    """Return ``dt`` unchanged if its year is acceptable for datetime64[ns].

    The check looks only at ``dt.year``: years 1678 through 2261 inclusive
    are accepted, everything else is rejected.

    Raises
    ------
    ValueError
        If ``dt.year`` is 1677 or earlier, or 2262 or later.
    """
    if 1677 < dt.year < 2262:
        return dt
    raise ValueError('Given datetime not within valid datetime64[ns] bounds')

def _does_string_look_like_datetime(date_string):
    """Cheap pre-filter deciding whether ``date_string`` is worth parsing.

    Returns True when the string starts with '0'. Otherwise returns False
    when the string converts to a float below 1000 or is one of the
    entries in ``_not_datelike_strings``, and True in every other case.
    """
    # A leading zero is more consistent with a date-like string (e.g.
    # '0101') than with a plain number, so accept it straight away.
    if date_string.startswith('0'):
        return True

    try:
        numeric_value = float(date_string)
    except ValueError:
        # Not numeric at all; fall through to the single-letter check.
        numeric_value = None

    if numeric_value is not None and numeric_value < 1000:
        return False

    return date_string not in _not_datelike_strings

def parse_datetime_string(date_string, verify_bounds=True, **kwargs):
    """Parse ``date_string`` after screening it for date-likeness.

    Parameters
    ----------
    date_string : str
        Text to parse.
    verify_bounds : bool, default True
        When true, the parsed value is passed through
        ``verify_datetime_bounds`` before being returned.
    **kwargs
        Forwarded unchanged to ``parse_date``.

    Returns
    -------
    Whatever ``parse_date`` returns for ``date_string``.

    Raises
    ------
    ValueError
        If ``_does_string_look_like_datetime`` rejects the string, or if
        bounds verification fails.
    """
    if _does_string_look_like_datetime(date_string):
        parsed = parse_date(date_string, **kwargs)
        if not verify_bounds:
            return parsed
        verify_datetime_bounds(parsed)
        return parsed

    raise ValueError('Given date string not likely a datetime.')

def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
format=None, utc=None, coerce=False, unit=None):
Expand Down Expand Up @@ -908,24 +944,15 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
&dts)
_check_dts_bounds(iresult[i], &dts)
except ValueError:

# for some reason, dateutil parses some single letter len-1 strings into today's date
if len(val) == 1 and val in _not_datelike_strings:
if coerce:
iresult[i] = iNaT
continue
elif raise_:
raise
try:
result[i] = parse_date(val, dayfirst=dayfirst)
result[i] = parse_datetime_string(
val, dayfirst=dayfirst
)
except Exception:
if coerce:
iresult[i] = iNaT
continue
raise TypeError
pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns,
&dts)
_check_dts_bounds(iresult[i], &dts)
except:
if coerce:
iresult[i] = iNaT
Expand All @@ -946,7 +973,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
oresult[i] = 'NaT'
continue
try:
oresult[i] = parse_date(val, dayfirst=dayfirst)
oresult[i] = parse_datetime_string(val, dayfirst=dayfirst)
except Exception:
if raise_:
raise
Expand Down