Skip to content

Commit 41dbca6

Browse files
committed
Merge pull request #4826 from jreback/dunit
PERF: much faster to_datetime performance with a format of '%Y%m%d'
2 parents 2f24d6e + 2c4356e commit 41dbca6

File tree

4 files changed

+47
-1
lines changed

4 files changed

+47
-1
lines changed

doc/source/release.rst

+1
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,7 @@ Improvements to existing features
105105
test to vbench (:issue:`4705` and :issue:`4722`)
106106
- Add ``axis`` and ``level`` keywords to ``where``, so that the ``other`` argument
107107
can now be an alignable pandas object.
108+
- ``to_datetime`` with a format of ``'%Y%m%d'`` (i.e. YYYYMMDD) now parses much faster (:issue:`4826`)
108109

109110
API Changes
110111
~~~~~~~~~~~

pandas/tseries/tests/test_timeseries.py

+15
Original file line number | Diff line number | Diff line change
@@ -834,6 +834,21 @@ def test_to_datetime_format(self):
834834
else:
835835
self.assert_(result.equals(expected))
836836

837+
def test_to_datetime_format_YYYYMMDD(self):
838+
s = Series([19801222,19801222] + [19810105]*5)
839+
expected = Series([ Timestamp(x) for x in s.apply(str) ])
840+
841+
result = to_datetime(s,format='%Y%m%d')
842+
assert_series_equal(result, expected)
843+
844+
result = to_datetime(s.apply(str),format='%Y%m%d')
845+
assert_series_equal(result, expected)
846+
847+
# with NaT
848+
s[2] = np.nan
849+
self.assertRaises(ValueError, to_datetime, s,format='%Y%m%d')
850+
self.assertRaises(ValueError, to_datetime, s.apply(str),format='%Y%m%d')
851+
837852
def test_to_datetime_format_microsecond(self):
838853
val = '01-Apr-2011 00:00:01.978'
839854
format = '%d-%b-%Y %H:%M:%S.%f'

pandas/tseries/tools.py

+13-1
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,19 @@ def _convert_listlike(arg, box):
101101
arg = com._ensure_object(arg)
102102
try:
103103
if format is not None:
104-
result = tslib.array_strptime(arg, format)
104+
result = None
105+
106+
# shortcut formatting here
107+
if format == '%Y%m%d':
108+
try:
109+
carg = arg.astype(np.int64).astype(object)
110+
result = lib.try_parse_year_month_day(carg/10000,carg/100 % 100, carg % 100)
111+
except:
112+
raise ValueError("cannot convert the input to '%Y%m%d' date format")
113+
114+
# fallback
115+
if result is None:
116+
result = tslib.array_strptime(arg, format)
105117
else:
106118
result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
107119
utc=utc, dayfirst=dayfirst,

vb_suite/timeseries.py

+18
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,24 @@ def date_range(start=None, end=None, periods=None, freq=None):
147147
Benchmark('to_datetime(strings)', setup,
148148
start_date=datetime(2012, 7, 11))
149149

150+
setup = common_setup + """
151+
rng = date_range('1/1/2000', periods=10000, freq='D')
152+
strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)
153+
"""
154+
155+
timeseries_to_datetime_YYYYMMDD = \
156+
Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
157+
start_date=datetime(2013, 9, 1))
158+
159+
setup = common_setup + """
160+
rng = date_range('1/1/2000', periods=10000, freq='D')
161+
strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)
162+
"""
163+
164+
timeseries_to_datetime_YYYYMMDD_old = \
165+
Benchmark('pandas.tslib.array_strptime(strings.values,"%Y%m%d")', setup,
166+
start_date=datetime(2013, 9, 1))
167+
150168
# ---- infer_freq
151169
# infer_freq
152170

0 commit comments

Comments
 (0)