Skip to content

Commit 0076151

Browse files
committed
BUG: Bug in .to_datetime() when passing integers or floats, no unit and errors=coerce
closes #13180
1 parent 4b50149 commit 0076151

File tree

5 files changed

+176
-81
lines changed

5 files changed

+176
-81
lines changed

doc/source/whatsnew/v0.18.2.txt

+13-1
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,26 @@ New Behavior:
7676
type(s.tolist()[0])
7777

7878

79+
.. _whatsnew_0182.api.to_datetime_coerce:
7980

81+
``.to_datetime()`` when coercing
82+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
8083

84+
A bug is fixed in ``.to_datetime()`` when passing integers or floats with no ``unit`` and ``errors='coerce'`` (:issue:`13180`).
85+
Previously, if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes, with ``errors='coerce'``, it would convert all of them to ``NaT``.
8186

87+
Previous Behavior:
8288

89+
.. code-block:: ipython
8390

91+
In [2]: pd.to_datetime([1, 'foo'], errors='coerce')
92+
Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None)
8493

94+
This will now convert integers/floats with the default unit of ``ns``.
8595

96+
.. ipython:: python
97+
98+
pd.to_datetime([1, 'foo'], errors='coerce')
8699

87100
.. _whatsnew_0182.api.other:
88101

@@ -136,7 +149,6 @@ Bug Fixes
136149

137150

138151

139-
140152
- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
141153

142154

pandas/tests/series/test_internals.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ def test_convert_objects(self):
103103
with tm.assert_produces_warning(FutureWarning):
104104
result = s.convert_objects(convert_dates='coerce',
105105
convert_numeric=False)
106-
assert_series_equal(result, s)
106+
expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
107+
assert_series_equal(result, expected)
107108

108109
# preserve if non-object
109110
s = Series([1], dtype='float32')
@@ -270,7 +271,7 @@ def test_convert(self):
270271

271272
s = Series(['foo', 'bar', 1, 1.0], dtype='O')
272273
result = s._convert(datetime=True, coerce=True)
273-
expected = Series([lib.NaT] * 4)
274+
expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
274275
assert_series_equal(result, expected)
275276

276277
# preserve if non-object

pandas/tseries/tests/test_timeseries.py

+126-62
Original file line numberDiff line numberDiff line change
@@ -762,6 +762,15 @@ def test_to_datetime_unit(self):
762762
with self.assertRaises(ValueError):
763763
to_datetime([1, 2, 111111111], unit='D')
764764

765+
# coerce we can process
766+
expected = DatetimeIndex([Timestamp('1970-01-02'),
767+
Timestamp('1970-01-03')] + ['NaT'] * 1)
768+
result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce')
769+
tm.assert_index_equal(result, expected)
770+
771+
result = to_datetime([1, 2, 111111111], unit='D', errors='coerce')
772+
tm.assert_index_equal(result, expected)
773+
765774
def test_series_ctor_datetime64(self):
766775
rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
767776
dates = np.asarray(rng)
@@ -2283,6 +2292,123 @@ def test_to_datetime_tz_psycopg2(self):
22832292
dtype='datetime64[ns, UTC]')
22842293
tm.assert_index_equal(result, expected)
22852294

2295+
def test_unit(self):
2296+
# GH 11758
2297+
# test proper behavior with errors
2298+
2299+
with self.assertRaises(ValueError):
2300+
to_datetime([1], unit='D', format='%Y%m%d')
2301+
2302+
values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
2303+
'NaT', '']
2304+
result = to_datetime(values, unit='D', errors='ignore')
2305+
expected = Index([11111111, Timestamp('1970-01-02'),
2306+
Timestamp('1970-01-02'), pd.NaT,
2307+
pd.NaT, pd.NaT, pd.NaT, pd.NaT],
2308+
dtype=object)
2309+
tm.assert_index_equal(result, expected)
2310+
2311+
result = to_datetime(values, unit='D', errors='coerce')
2312+
expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
2313+
'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
2314+
tm.assert_index_equal(result, expected)
2315+
2316+
with self.assertRaises(tslib.OutOfBoundsDatetime):
2317+
to_datetime(values, unit='D', errors='raise')
2318+
2319+
values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']
2320+
2321+
result = to_datetime(values, errors='ignore', unit='s')
2322+
expected = Index([1420043460000, pd.NaT, pd.NaT,
2323+
pd.NaT, pd.NaT], dtype=object)
2324+
tm.assert_index_equal(result, expected)
2325+
2326+
result = to_datetime(values, errors='coerce', unit='s')
2327+
expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
2328+
tm.assert_index_equal(result, expected)
2329+
2330+
with self.assertRaises(tslib.OutOfBoundsDatetime):
2331+
to_datetime(values, errors='raise', unit='s')
2332+
2333+
# if we have a string, then we raise a ValueError
2334+
# and NOT an OutOfBoundsDatetime
2335+
for val in ['foo', Timestamp('20130101')]:
2336+
try:
2337+
to_datetime(val, errors='raise', unit='s')
2338+
except tslib.OutOfBoundsDatetime:
2339+
raise AssertionError("incorrect exception raised")
2340+
except ValueError:
2341+
pass
2342+
2343+
def test_unit_consistency(self):
2344+
2345+
# consistency of conversions
2346+
expected = Timestamp('1970-05-09 14:25:11')
2347+
result = pd.to_datetime(11111111, unit='s', errors='raise')
2348+
self.assertEqual(result, expected)
2349+
self.assertIsInstance(result, Timestamp)
2350+
2351+
result = pd.to_datetime(11111111, unit='s', errors='coerce')
2352+
self.assertEqual(result, expected)
2353+
self.assertIsInstance(result, Timestamp)
2354+
2355+
result = pd.to_datetime(11111111, unit='s', errors='ignore')
2356+
self.assertEqual(result, expected)
2357+
self.assertIsInstance(result, Timestamp)
2358+
2359+
def test_unit_with_numeric(self):
2360+
2361+
# GH 13180
2362+
# coercions from floats/ints are ok
2363+
expected = DatetimeIndex(['2015-06-19 05:33:20',
2364+
'2015-05-27 22:33:20'])
2365+
arr1 = [1.434692e+18, 1.432766e+18]
2366+
arr2 = np.array(arr1).astype(int)
2367+
for errors in ['ignore', 'raise', 'coerce']:
2368+
result = pd.to_datetime(arr1, errors=errors)
2369+
tm.assert_index_equal(result, expected)
2370+
2371+
result = pd.to_datetime(arr2, errors=errors)
2372+
tm.assert_index_equal(result, expected)
2373+
2374+
# but we want to make sure that we are coercing
2375+
# if we have ints/strings
2376+
expected = DatetimeIndex(['NaT',
2377+
'2015-06-19 05:33:20',
2378+
'2015-05-27 22:33:20'])
2379+
arr = ['foo', 1.434692e+18, 1.432766e+18]
2380+
result = pd.to_datetime(arr, errors='coerce')
2381+
tm.assert_index_equal(result, expected)
2382+
2383+
expected = DatetimeIndex(['2015-06-19 05:33:20',
2384+
'2015-05-27 22:33:20',
2385+
'NaT',
2386+
'NaT'])
2387+
arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT']
2388+
result = pd.to_datetime(arr, errors='coerce')
2389+
tm.assert_index_equal(result, expected)
2390+
2391+
def test_unit_mixed(self):
2392+
2393+
# mixed integers/datetimes
2394+
expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT'])
2395+
arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18]
2396+
result = pd.to_datetime(arr, errors='coerce')
2397+
tm.assert_index_equal(result, expected)
2398+
2399+
with self.assertRaises(ValueError):
2400+
pd.to_datetime(arr, errors='raise')
2401+
2402+
expected = DatetimeIndex(['NaT',
2403+
'NaT',
2404+
'2013-01-01'])
2405+
arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')]
2406+
result = pd.to_datetime(arr, errors='coerce')
2407+
tm.assert_index_equal(result, expected)
2408+
2409+
with self.assertRaises(ValueError):
2410+
pd.to_datetime(arr, errors='raise')
2411+
22862412
def test_index_to_datetime(self):
22872413
idx = Index(['1/1/2000', '1/2/2000', '1/3/2000'])
22882414

@@ -4229,68 +4355,6 @@ def check(val, unit=None, h=1, s=1, us=0):
42294355
result = Timestamp('NaT')
42304356
self.assertIs(result, NaT)
42314357

4232-
def test_unit_errors(self):
4233-
# GH 11758
4234-
# test proper behavior with erros
4235-
4236-
with self.assertRaises(ValueError):
4237-
to_datetime([1], unit='D', format='%Y%m%d')
4238-
4239-
values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
4240-
'NaT', '']
4241-
result = to_datetime(values, unit='D', errors='ignore')
4242-
expected = Index([11111111, Timestamp('1970-01-02'),
4243-
Timestamp('1970-01-02'), pd.NaT,
4244-
pd.NaT, pd.NaT, pd.NaT, pd.NaT],
4245-
dtype=object)
4246-
tm.assert_index_equal(result, expected)
4247-
4248-
result = to_datetime(values, unit='D', errors='coerce')
4249-
expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
4250-
'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
4251-
tm.assert_index_equal(result, expected)
4252-
4253-
with self.assertRaises(tslib.OutOfBoundsDatetime):
4254-
to_datetime(values, unit='D', errors='raise')
4255-
4256-
values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']
4257-
4258-
result = to_datetime(values, errors='ignore', unit='s')
4259-
expected = Index([1420043460000, pd.NaT, pd.NaT,
4260-
pd.NaT, pd.NaT], dtype=object)
4261-
tm.assert_index_equal(result, expected)
4262-
4263-
result = to_datetime(values, errors='coerce', unit='s')
4264-
expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
4265-
tm.assert_index_equal(result, expected)
4266-
4267-
with self.assertRaises(tslib.OutOfBoundsDatetime):
4268-
to_datetime(values, errors='raise', unit='s')
4269-
4270-
# if we have a string, then we raise a ValueError
4271-
# and NOT an OutOfBoundsDatetime
4272-
for val in ['foo', Timestamp('20130101')]:
4273-
try:
4274-
to_datetime(val, errors='raise', unit='s')
4275-
except tslib.OutOfBoundsDatetime:
4276-
raise AssertionError("incorrect exception raised")
4277-
except ValueError:
4278-
pass
4279-
4280-
# consistency of conversions
4281-
expected = Timestamp('1970-05-09 14:25:11')
4282-
result = pd.to_datetime(11111111, unit='s', errors='raise')
4283-
self.assertEqual(result, expected)
4284-
self.assertIsInstance(result, Timestamp)
4285-
4286-
result = pd.to_datetime(11111111, unit='s', errors='coerce')
4287-
self.assertEqual(result, expected)
4288-
self.assertIsInstance(result, Timestamp)
4289-
4290-
result = pd.to_datetime(11111111, unit='s', errors='ignore')
4291-
self.assertEqual(result, expected)
4292-
self.assertIsInstance(result, Timestamp)
4293-
42944358
def test_roundtrip(self):
42954359

42964360
# test value to string and back conversions

pandas/tseries/tools.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
221221
- If True, require an exact format match.
222222
- If False, allow the format to match anywhere in the target string.
223223
224-
unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
224+
unit : string, default 'ns'
225+
unit of the arg (D,s,ms,us,ns), denoting the unit in epoch
225226
(e.g. a unix timestamp), which is an integer/float number.
226227
infer_datetime_format : boolean, default False
227228
If True and no `format` is given, attempt to infer the format of the

pandas/tslib.pyx

+32-15
Original file line numberDiff line numberDiff line change
@@ -2082,6 +2082,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
20822082
unit))
20832083
elif is_ignore:
20842084
raise AssertionError
2085+
iresult[i] = NPY_NAT
20852086
except:
20862087
if is_raise:
20872088
raise OutOfBoundsDatetime("cannot convert input {0}"
@@ -2149,7 +2150,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
21492150
ndarray[int64_t] iresult
21502151
ndarray[object] oresult
21512152
pandas_datetimestruct dts
2152-
bint utc_convert = bool(utc), seen_integer=0, seen_datetime=0
2153+
bint utc_convert = bool(utc), seen_integer=0, seen_string=0, seen_datetime=0
21532154
bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce'
21542155
_TSObject _ts
21552156
int out_local=0, out_tzoffset=0
@@ -2215,25 +2216,32 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
22152216
continue
22162217
raise
22172218

2218-
# if we are coercing, dont' allow integers
2219-
elif is_integer_object(val) and not is_coerce:
2220-
if val == NPY_NAT:
2219+
# these must be ns unit by-definition
2220+
elif is_integer_object(val) or is_float_object(val):
2221+
2222+
if val != val or val == NPY_NAT:
22212223
iresult[i] = NPY_NAT
2222-
else:
2224+
elif is_raise or is_ignore:
22232225
iresult[i] = val
22242226
seen_integer=1
2225-
elif is_float_object(val) and not is_coerce:
2226-
if val != val or val == NPY_NAT:
2227-
iresult[i] = NPY_NAT
22282227
else:
2229-
iresult[i] = <int64_t>val
2230-
seen_integer=1
2228+
# coerce
2229+
# we now need to parse this as if unit='ns'
2230+
# we can ONLY accept integers at this point
2231+
# if we have previously accepted (or in future accept)
2232+
# datetimes/strings, then we must coerce
2233+
seen_integer = 1
2234+
try:
2235+
iresult[i] = cast_from_unit(val, 'ns')
2236+
except:
2237+
iresult[i] = NPY_NAT
22312238
else:
22322239
try:
22332240
if len(val) == 0 or val in _nat_strings:
22342241
iresult[i] = NPY_NAT
22352242
continue
22362243

2244+
seen_string=1
22372245
_string_to_dts(val, &dts, &out_local, &out_tzoffset)
22382246
value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
22392247
if out_local == 1:
@@ -2276,11 +2284,20 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
22762284
continue
22772285
raise
22782286

2279-
# don't allow mixed integers and datetime like
2280-
# higher levels can catch and is_coerce to object, for
2281-
# example
2282-
if seen_integer and seen_datetime:
2283-
raise ValueError("mixed datetimes and integers in passed array")
2287+
if seen_datetime and seen_integer:
2288+
# we have mixed datetimes & integers
2289+
2290+
if is_coerce:
2291+
# coerce all of the integers/floats to NaT, preserve
2292+
# the datetimes and other convertibles
2293+
for i in range(n):
2294+
val = values[i]
2295+
if is_integer_object(val) or is_float_object(val):
2296+
result[i] = NPY_NAT
2297+
elif is_raise:
2298+
raise ValueError("mixed datetimes and integers in passed array")
2299+
else:
2300+
raise TypeError
22842301

22852302
return result
22862303
except OutOfBoundsDatetime:

0 commit comments

Comments
 (0)