Skip to content

Commit ea2489d

Browse files
committed
PERF: add exact kw to to_datetime to enable faster regex format parsing for datetimes (GH8904)
1 parent 526f33c commit ea2489d

File tree

5 files changed

+70
-19
lines changed

5 files changed

+70
-19
lines changed

doc/source/whatsnew/v0.15.2.txt

+2
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ Performance
8585
~~~~~~~~~~~
8686
- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)
8787

88+
- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and a new ``exact=False`` kw to allow non-exact matching of the format (:issue:`8904`)
89+
8890
.. _whatsnew_0152.experimental:
8991

9092
Experimental

pandas/tseries/tests/test_timeseries.py

+10
Original file line numberDiff line numberDiff line change
@@ -4123,6 +4123,16 @@ def test_to_datetime_format_time(self):
41234123
for s, format, dt in data:
41244124
self.assertEqual(to_datetime(s, format=format), dt)
41254125

4126+
def test_to_datetime_with_non_exact(self):
4127+
4128+
if sys.version_info < (2, 7):
4129+
raise nose.SkipTest('on python version < 2.7')
4130+
4131+
s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']*10000)
4132+
result = to_datetime(s,format='%d%b%y',exact=False)
4133+
expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y')
4134+
assert_series_equal(result, expected)
4135+
41264136
def test_to_datetime_format_weeks(self):
41274137
data = [
41284138
['2009324', '%Y%W%w', Timestamp('2009-08-13')],

pandas/tseries/tools.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
174174
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
175175

176176
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
177-
format=None, coerce=False, unit='ns',
177+
format=None, exact=True, coerce=False, unit='ns',
178178
infer_datetime_format=False):
179179
"""
180180
Convert argument to datetime.
@@ -195,6 +195,9 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
195195
If True returns a DatetimeIndex, if False returns ndarray of values
196196
format : string, default None
197197
strftime to parse time, eg "%d/%m/%Y"
198+
exact : boolean, True by default
199+
if True, require an exact format match
200+
if False, allow the format to match anywhere in the target string
198201
coerce : force errors to NaT (False by default)
199202
unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
200203
(e.g. a unix timestamp), which is an integer/float number
@@ -273,7 +276,7 @@ def _convert_listlike(arg, box, format):
273276
if result is None:
274277
try:
275278
result = tslib.array_strptime(
276-
arg, format, coerce=coerce
279+
arg, format, exact=exact, coerce=coerce
277280
)
278281
except (tslib.OutOfBoundsDatetime):
279282
if errors == 'raise':

pandas/tslib.pyx

+45-17
Original file line numberDiff line numberDiff line change
@@ -2123,13 +2123,24 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
21232123
raise ValueError("Invalid type for timedelta scalar: %s" % type(ts))
21242124
return ts.astype('timedelta64[ns]')
21252125

2126-
def array_strptime(ndarray[object] values, object fmt, coerce=False):
2126+
def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False):
2127+
"""
2128+
Parameters
2129+
----------
2130+
values : ndarray of string-like objects
2131+
fmt : string-like strftime format
2132+
exact : matches must be exact if True, search if False
2133+
coerce : if invalid values found, coerce to NaT
2134+
"""
2135+
21272136
cdef:
21282137
Py_ssize_t i, n = len(values)
21292138
pandas_datetimestruct dts
21302139
ndarray[int64_t] iresult
2131-
int year, month, day, minute, hour, second, fraction, weekday, julian
2132-
object val
2140+
int year, month, day, minute, hour, second, fraction, weekday, julian, tz
2141+
int week_of_year, week_of_year_start
2142+
object val, group_key, ampm, found
2143+
dict found_key
21332144

21342145
global _TimeRE_cache, _regex_cache
21352146
with _cache_lock:
@@ -2198,19 +2209,32 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
21982209
else:
21992210
val = str(val)
22002211

2201-
found = format_regex.match(val)
2202-
if not found:
2203-
if coerce:
2204-
iresult[i] = iNaT
2205-
continue
2206-
raise ValueError("time data %r does not match format %r" %
2207-
(values[i], fmt))
2208-
if len(val) != found.end():
2209-
if coerce:
2210-
iresult[i] = iNaT
2211-
continue
2212-
raise ValueError("unconverted data remains: %s" %
2213-
values[i][found.end():])
2212+
# exact matching
2213+
if exact:
2214+
found = format_regex.match(val)
2215+
if not found:
2216+
if coerce:
2217+
iresult[i] = iNaT
2218+
continue
2219+
raise ValueError("time data %r does not match format %r (match)" %
2220+
(values[i], fmt))
2221+
if len(val) != found.end():
2222+
if coerce:
2223+
iresult[i] = iNaT
2224+
continue
2225+
raise ValueError("unconverted data remains: %s" %
2226+
values[i][found.end():])
2227+
2228+
# search
2229+
else:
2230+
found = format_regex.search(val)
2231+
if not found:
2232+
if coerce:
2233+
iresult[i] = iNaT
2234+
continue
2235+
raise ValueError("time data %r does not match format %r (search)" %
2236+
(values[i], fmt))
2237+
22142238
year = 1900
22152239
month = day = 1
22162240
hour = minute = second = fraction = 0
@@ -4368,10 +4392,14 @@ _TimeRE_cache = TimeRE()
43684392
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
43694393
_regex_cache = {}
43704394

4371-
def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
4395+
cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon):
43724396
"""Calculate the Julian day based on the year, week of the year, and day of
43734397
the week, with week_starts_Mon representing whether the week of the year
43744398
assumes the week starts on Sunday or Monday (6 or 0)."""
4399+
4400+
cdef:
4401+
int first_weekday, week_0_length, days_to_week
4402+
43754403
first_weekday = datetime_date(year, 1, 1).weekday()
43764404
# If we are dealing with the %U directive (week starts on Sunday), it's
43774405
# easier to just shift the view to Sunday being the first day of the

vb_suite/timeseries.py

+8
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None):
156156
Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
157157
start_date=datetime(2012, 7, 1))
158158

159+
setup = common_setup + """
160+
s = Series(['19MAY11','19MAY11:00:00:00']*100000)
161+
"""
162+
timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
163+
setup, start_date=datetime(2014, 11, 26))
164+
timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \
165+
setup, start_date=datetime(2014, 11, 26))
166+
159167
# ---- infer_freq
160168
# infer_freq
161169

0 commit comments

Comments
 (0)