Skip to content

BUG: Bug in .to_datetime() when passing integers or floats, no unit and errors=coerce #13183

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,26 @@ New Behavior:
type(s.tolist()[0])


.. _whatsnew_0182.api.to_datetime_coerce:

``.to_datetime()`` when coercing
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A bug is fixed in ``.to_datetime()`` when passing integers or floats with no ``unit`` and ``errors='coerce'`` (:issue:`13180`).
Previously, if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes, with ``errors='coerce'``, it would convert all values to ``NaT``.

Previous Behavior:

.. code-block:: ipython

In [2]: pd.to_datetime([1, 'foo'], errors='coerce')
Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None)

This will now convert integers/floats with the default unit of ``ns``.

.. ipython:: python

pd.to_datetime([1, 'foo'], errors='coerce')

.. _whatsnew_0182.api.other:

Expand Down Expand Up @@ -136,7 +149,6 @@ Bug Fixes




- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)


Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/series/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ def test_convert_objects(self):
with tm.assert_produces_warning(FutureWarning):
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False)
assert_series_equal(result, s)
expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
assert_series_equal(result, expected)

# preserve if non-object
s = Series([1], dtype='float32')
Expand Down Expand Up @@ -270,7 +271,7 @@ def test_convert(self):

s = Series(['foo', 'bar', 1, 1.0], dtype='O')
result = s._convert(datetime=True, coerce=True)
expected = Series([lib.NaT] * 4)
expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
assert_series_equal(result, expected)

# preserve if non-object
Expand Down
188 changes: 126 additions & 62 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,15 @@ def test_to_datetime_unit(self):
with self.assertRaises(ValueError):
to_datetime([1, 2, 111111111], unit='D')

# with errors='coerce' we can process these; unconvertible values become NaT
expected = DatetimeIndex([Timestamp('1970-01-02'),
Timestamp('1970-01-03')] + ['NaT'] * 1)
result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce')
tm.assert_index_equal(result, expected)

result = to_datetime([1, 2, 111111111], unit='D', errors='coerce')
tm.assert_index_equal(result, expected)

def test_series_ctor_datetime64(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
dates = np.asarray(rng)
Expand Down Expand Up @@ -2283,6 +2292,123 @@ def test_to_datetime_tz_psycopg2(self):
dtype='datetime64[ns, UTC]')
tm.assert_index_equal(result, expected)

def test_unit(self):
# GH 11758
# test proper behavior with errors

with self.assertRaises(ValueError):
to_datetime([1], unit='D', format='%Y%m%d')

values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
'NaT', '']
result = to_datetime(values, unit='D', errors='ignore')
expected = Index([11111111, Timestamp('1970-01-02'),
Timestamp('1970-01-02'), pd.NaT,
pd.NaT, pd.NaT, pd.NaT, pd.NaT],
dtype=object)
tm.assert_index_equal(result, expected)

result = to_datetime(values, unit='D', errors='coerce')
expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
tm.assert_index_equal(result, expected)

with self.assertRaises(tslib.OutOfBoundsDatetime):
to_datetime(values, unit='D', errors='raise')

values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']

result = to_datetime(values, errors='ignore', unit='s')
expected = Index([1420043460000, pd.NaT, pd.NaT,
pd.NaT, pd.NaT], dtype=object)
tm.assert_index_equal(result, expected)

result = to_datetime(values, errors='coerce', unit='s')
expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
tm.assert_index_equal(result, expected)

with self.assertRaises(tslib.OutOfBoundsDatetime):
to_datetime(values, errors='raise', unit='s')

# if we have a string, then we raise a ValueError
# and NOT an OutOfBoundsDatetime
for val in ['foo', Timestamp('20130101')]:
try:
to_datetime(val, errors='raise', unit='s')
except tslib.OutOfBoundsDatetime:
raise AssertionError("incorrect exception raised")
except ValueError:
pass

def test_unit_consistency(self):

# consistency of conversions
expected = Timestamp('1970-05-09 14:25:11')
result = pd.to_datetime(11111111, unit='s', errors='raise')
self.assertEqual(result, expected)
self.assertIsInstance(result, Timestamp)

result = pd.to_datetime(11111111, unit='s', errors='coerce')
self.assertEqual(result, expected)
self.assertIsInstance(result, Timestamp)

result = pd.to_datetime(11111111, unit='s', errors='ignore')
self.assertEqual(result, expected)
self.assertIsInstance(result, Timestamp)

def test_unit_with_numeric(self):

# GH 13180
# coercions from floats/ints are ok
expected = DatetimeIndex(['2015-06-19 05:33:20',
'2015-05-27 22:33:20'])
arr1 = [1.434692e+18, 1.432766e+18]
arr2 = np.array(arr1).astype(int)
for errors in ['ignore', 'raise', 'coerce']:
result = pd.to_datetime(arr1, errors=errors)
tm.assert_index_equal(result, expected)

result = pd.to_datetime(arr2, errors=errors)
tm.assert_index_equal(result, expected)

# but we want to make sure that we are coercing
# if we have ints/strings
expected = DatetimeIndex(['NaT',
'2015-06-19 05:33:20',
'2015-05-27 22:33:20'])
arr = ['foo', 1.434692e+18, 1.432766e+18]
result = pd.to_datetime(arr, errors='coerce')
tm.assert_index_equal(result, expected)

expected = DatetimeIndex(['2015-06-19 05:33:20',
'2015-05-27 22:33:20',
'NaT',
'NaT'])
arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT']
result = pd.to_datetime(arr, errors='coerce')
tm.assert_index_equal(result, expected)

def test_unit_mixed(self):

# mixed integers/datetimes
expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT'])
arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18]
result = pd.to_datetime(arr, errors='coerce')
tm.assert_index_equal(result, expected)

with self.assertRaises(ValueError):
pd.to_datetime(arr, errors='raise')

expected = DatetimeIndex(['NaT',
'NaT',
'2013-01-01'])
arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')]
result = pd.to_datetime(arr, errors='coerce')
tm.assert_index_equal(result, expected)

with self.assertRaises(ValueError):
pd.to_datetime(arr, errors='raise')

def test_index_to_datetime(self):
idx = Index(['1/1/2000', '1/2/2000', '1/3/2000'])

Expand Down Expand Up @@ -4229,68 +4355,6 @@ def check(val, unit=None, h=1, s=1, us=0):
result = Timestamp('NaT')
self.assertIs(result, NaT)

def test_unit_errors(self):
# GH 11758
# test proper behavior with errors

with self.assertRaises(ValueError):
to_datetime([1], unit='D', format='%Y%m%d')

values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
'NaT', '']
result = to_datetime(values, unit='D', errors='ignore')
expected = Index([11111111, Timestamp('1970-01-02'),
Timestamp('1970-01-02'), pd.NaT,
pd.NaT, pd.NaT, pd.NaT, pd.NaT],
dtype=object)
tm.assert_index_equal(result, expected)

result = to_datetime(values, unit='D', errors='coerce')
expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
tm.assert_index_equal(result, expected)

with self.assertRaises(tslib.OutOfBoundsDatetime):
to_datetime(values, unit='D', errors='raise')

values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']

result = to_datetime(values, errors='ignore', unit='s')
expected = Index([1420043460000, pd.NaT, pd.NaT,
pd.NaT, pd.NaT], dtype=object)
tm.assert_index_equal(result, expected)

result = to_datetime(values, errors='coerce', unit='s')
expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
tm.assert_index_equal(result, expected)

with self.assertRaises(tslib.OutOfBoundsDatetime):
to_datetime(values, errors='raise', unit='s')

# if we have a string, then we raise a ValueError
# and NOT an OutOfBoundsDatetime
for val in ['foo', Timestamp('20130101')]:
try:
to_datetime(val, errors='raise', unit='s')
except tslib.OutOfBoundsDatetime:
raise AssertionError("incorrect exception raised")
except ValueError:
pass

# consistency of conversions
expected = Timestamp('1970-05-09 14:25:11')
result = pd.to_datetime(11111111, unit='s', errors='raise')
self.assertEqual(result, expected)
self.assertIsInstance(result, Timestamp)

result = pd.to_datetime(11111111, unit='s', errors='coerce')
self.assertEqual(result, expected)
self.assertIsInstance(result, Timestamp)

result = pd.to_datetime(11111111, unit='s', errors='ignore')
self.assertEqual(result, expected)
self.assertIsInstance(result, Timestamp)

def test_roundtrip(self):

# test value to string and back conversions
Expand Down
3 changes: 2 additions & 1 deletion pandas/tseries/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
- If True, require an exact format match.
- If False, allow the format to match anywhere in the target string.

unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
unit : string, default 'ns'
unit of the arg (D,s,ms,us,ns) denote the unit in epoch
(e.g. a unix timestamp), which is an integer/float number.
infer_datetime_format : boolean, default False
If True and no `format` is given, attempt to infer the format of the
Expand Down
47 changes: 32 additions & 15 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2082,6 +2082,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
unit))
elif is_ignore:
raise AssertionError
iresult[i] = NPY_NAT
except:
if is_raise:
raise OutOfBoundsDatetime("cannot convert input {0}"
Expand Down Expand Up @@ -2149,7 +2150,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
ndarray[int64_t] iresult
ndarray[object] oresult
pandas_datetimestruct dts
bint utc_convert = bool(utc), seen_integer=0, seen_datetime=0
bint utc_convert = bool(utc), seen_integer=0, seen_string=0, seen_datetime=0
bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce'
_TSObject _ts
int out_local=0, out_tzoffset=0
Expand Down Expand Up @@ -2215,25 +2216,32 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
continue
raise

# if we are coercing, don't allow integers
elif is_integer_object(val) and not is_coerce:
if val == NPY_NAT:
# these must be ns unit by-definition
elif is_integer_object(val) or is_float_object(val):

if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
else:
elif is_raise or is_ignore:
iresult[i] = val
seen_integer=1
elif is_float_object(val) and not is_coerce:
if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
else:
iresult[i] = <int64_t>val
seen_integer=1
# coerce
# we now need to parse this as if unit='ns'
# we can ONLY accept integers at this point
# if we have previously accepted (or later accept)
# datetimes/strings, then we must coerce
seen_integer = 1
try:
iresult[i] = cast_from_unit(val, 'ns')
except:
iresult[i] = NPY_NAT
else:
try:
if len(val) == 0 or val in _nat_strings:
iresult[i] = NPY_NAT
continue

seen_string=1
_string_to_dts(val, &dts, &out_local, &out_tzoffset)
value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
if out_local == 1:
Expand Down Expand Up @@ -2276,11 +2284,20 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
continue
raise

# don't allow mixed integers and datetime like
# higher levels can catch and is_coerce to object, for
# example
if seen_integer and seen_datetime:
raise ValueError("mixed datetimes and integers in passed array")
if seen_datetime and seen_integer:
# we have mixed datetimes & integers

if is_coerce:
# coerce all of the integers/floats to NaT, preserve
# the datetimes and other convertibles
for i in range(n):
val = values[i]
if is_integer_object(val) or is_float_object(val):
result[i] = NPY_NAT
elif is_raise:
raise ValueError("mixed datetimes and integers in passed array")
else:
raise TypeError

return result
except OutOfBoundsDatetime:
Expand Down