Skip to content

Commit a3cca39

Browse files
committed
Merge pull request #10615 from chris-b1/master
PERF: Improve perf of to_datetime with ISO format
2 parents d25a9f3 + 1f01990 commit a3cca39

File tree

5 files changed

+61
-23
lines changed

5 files changed

+61
-23
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ Performance Improvements
328328
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
329329
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
330330
- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
331+
- Improved performance of ``to_datetime`` when the specified format string is ISO8601 (:issue:`10178`)
331332

332333

333334
.. _whatsnew_0170.bug_fixes:

pandas/tseries/tests/test_timeseries.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -919,8 +919,8 @@ def test_to_datetime_with_apply(self):
919919
assert_series_equal(result, expected)
920920

921921
td = pd.Series(['May 04', 'Jun 02', ''], index=[1,2,3])
922-
self.assertRaises(ValueError, lambda : pd.to_datetime(td,format='%b %y'))
923-
self.assertRaises(ValueError, lambda : td.apply(pd.to_datetime, format='%b %y'))
922+
self.assertRaises(ValueError, lambda : pd.to_datetime(td,format='%b %y', errors='raise'))
923+
self.assertRaises(ValueError, lambda : td.apply(pd.to_datetime, format='%b %y', errors='raise'))
924924
expected = pd.to_datetime(td, format='%b %y', coerce=True)
925925

926926
result = td.apply(lambda x: pd.to_datetime(x, format='%b %y', coerce=True))
@@ -4197,6 +4197,20 @@ def test_to_datetime_format_YYYYMMDD(self):
41974197
expected = Series(['20121231','20141231','NaT'],dtype='M8[ns]')
41984198
assert_series_equal(result, expected)
41994199

4200+
# GH 10178
4201+
def test_to_datetime_format_integer(self):
4202+
s = Series([2000, 2001, 2002])
4203+
expected = Series([ Timestamp(x) for x in s.apply(str) ])
4204+
4205+
result = to_datetime(s,format='%Y')
4206+
assert_series_equal(result, expected)
4207+
4208+
s = Series([200001, 200105, 200206])
4209+
expected = Series([ Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) ])
4210+
4211+
result = to_datetime(s,format='%Y%m')
4212+
assert_series_equal(result, expected)
4213+
42004214
def test_to_datetime_format_microsecond(self):
42014215
val = '01-Apr-2011 00:00:01.978'
42024216
format = '%d-%b-%Y %H:%M:%S.%f'
@@ -4524,9 +4538,9 @@ def test_day_not_in_month_coerce_false_raise(self):
45244538

45254539
def test_day_not_in_month_coerce_false_ignore(self):
45264540
self.assertEqual(to_datetime('2015-02-29', errors='ignore', coerce=False), '2015-02-29')
4527-
self.assertRaises(ValueError, to_datetime, '2015-02-29', errors='ignore', format="%Y-%m-%d", coerce=False)
4528-
self.assertRaises(ValueError, to_datetime, '2015-02-32', errors='ignore', format="%Y-%m-%d", coerce=False)
4529-
self.assertRaises(ValueError, to_datetime, '2015-04-31', errors='ignore', format="%Y-%m-%d", coerce=False)
4541+
self.assertEqual(to_datetime('2015-02-29', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-02-29')
4542+
self.assertEqual(to_datetime('2015-02-32', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-02-32')
4543+
self.assertEqual(to_datetime('2015-04-31', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-04-31')
45304544

45314545
if __name__ == '__main__':
45324546
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tseries/tools.py

+23-16
Original file line numberDiff line numberDiff line change
@@ -296,21 +296,24 @@ def _convert_listlike(arg, box, format):
296296
return result
297297

298298
arg = com._ensure_object(arg)
299+
require_iso8601 = False
299300

300301
if infer_datetime_format and format is None:
301302
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
302303

303-
if format is not None:
304-
# There is a special fast-path for iso8601 formatted
305-
# datetime strings, so in those cases don't use the inferred
306-
# format because this path makes process slower in this
307-
# special case
308-
format_is_iso8601 = (
309-
'%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
310-
'%Y-%m-%d %H:%M:%S.%f'.startswith(format)
311-
)
312-
if format_is_iso8601:
313-
format = None
304+
if format is not None:
305+
# There is a special fast-path for iso8601 formatted
306+
# datetime strings, so in those cases don't use the inferred
307+
# format because this path makes process slower in this
308+
# special case
309+
format_is_iso8601 = (
310+
('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
311+
'%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and
312+
format != '%Y'
313+
)
314+
if format_is_iso8601:
315+
require_iso8601 = not infer_datetime_format
316+
format = None
314317

315318
try:
316319
result = None
@@ -334,16 +337,20 @@ def _convert_listlike(arg, box, format):
334337
raise
335338
result = arg
336339
except ValueError:
337-
# Only raise this error if the user provided the
338-
# datetime format, and not when it was inferred
340+
# if format was inferred, try falling back
341+
# to array_to_datetime - terminate here
342+
# for specified formats
339343
if not infer_datetime_format:
340-
raise
344+
if errors == 'raise':
345+
raise
346+
result = arg
341347

342348
if result is None and (format is None or infer_datetime_format):
343-
result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
349+
result = tslib.array_to_datetime(arg, raise_=errors=='raise',
344350
utc=utc, dayfirst=dayfirst,
345351
yearfirst=yearfirst, freq=freq,
346-
coerce=coerce, unit=unit)
352+
coerce=coerce, unit=unit,
353+
require_iso8601=require_iso8601)
347354

348355
if com.is_datetime64_dtype(result) and box:
349356
result = DatetimeIndex(result, tz='utc' if utc else None)

pandas/tslib.pyx

+14-2
Original file line numberDiff line numberDiff line change
@@ -1808,7 +1808,8 @@ cpdef object _get_rule_month(object source, object default='DEC'):
18081808

18091809
cpdef array_to_datetime(ndarray[object] values, raise_=False,
18101810
dayfirst=False, yearfirst=False, freq=None,
1811-
format=None, utc=None, coerce=False, unit=None):
1811+
format=None, utc=None, coerce=False, unit=None,
1812+
require_iso8601=False):
18121813
cdef:
18131814
Py_ssize_t i, n = len(values)
18141815
object val, py_dt
@@ -1908,6 +1909,17 @@ cpdef array_to_datetime(ndarray[object] values, raise_=False,
19081909
iresult[i] = value
19091910
_check_dts_bounds(&dts)
19101911
except ValueError:
1912+
# if requiring iso8601 strings, skip trying other formats
1913+
if require_iso8601:
1914+
if coerce:
1915+
iresult[i] = iNaT
1916+
continue
1917+
elif raise_:
1918+
raise ValueError("time data %r does match format specified" %
1919+
(val,))
1920+
else:
1921+
return values
1922+
19111923
try:
19121924
py_dt = parse_datetime_string(val, dayfirst=dayfirst,
19131925
yearfirst=yearfirst, freq=freq)
@@ -1971,7 +1983,7 @@ cpdef array_to_datetime(ndarray[object] values, raise_=False,
19711983
continue
19721984
try:
19731985
oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
1974-
yearfirst=yearfirst, freq=freq)
1986+
yearfirst=yearfirst, freq=freq)
19751987
_pydatetime_to_dts(oresult[i], &dts)
19761988
_check_dts_bounds(&dts)
19771989
except Exception:

vb_suite/timeseries.py

+4
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ def date_range(start=None, end=None, periods=None, freq=None):
157157
Benchmark('to_datetime(strings)', setup,
158158
start_date=datetime(2012, 7, 11))
159159

160+
timeseries_to_datetime_iso8601_format = \
161+
Benchmark("to_datetime(strings, format='%Y-%m-%d %H:%M:%S')", setup,
162+
start_date=datetime(2012, 7, 11))
163+
160164
setup = common_setup + """
161165
rng = date_range('1/1/2000', periods=10000, freq='D')
162166
strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)

0 commit comments

Comments
 (0)