Skip to content

PERF: add exact kw to to_datetime to enable faster regex format parsing #8904

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 5, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,16 @@ Enhancements
- Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`).
- ``Series`` now works with map objects the same way as generators (:issue:`8909`).
- Added context manager to ``HDFStore`` for automatic closing (:issue:`8791`).
- ``to_datetime`` gains an ``exact`` keyword to allow a provided format string not to require an exact match (when ``exact=False``). ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`)

.. _whatsnew_0152.performance:

Performance
~~~~~~~~~~~
- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)

- Performance boost for ``to_datetime`` conversions with a passed ``format=`` and ``exact=False`` (:issue:`8904`)

.. _whatsnew_0152.experimental:

Experimental
Expand Down Expand Up @@ -141,6 +144,7 @@ Bug Fixes

- Report a ``TypeError`` when invalid/no parameters are passed in a groupby (:issue:`8015`)
- Regression in DatetimeIndex iteration with a Fixed/Local offset timezone (:issue:`8890`)
- Bug in ``to_datetime`` when parsing nanoseconds using the ``%f`` format (:issue:`8989`)
- Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
- Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).
Expand Down
26 changes: 26 additions & 0 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4123,6 +4123,32 @@ def test_to_datetime_format_time(self):
for s, format, dt in data:
self.assertEqual(to_datetime(s, format=format), dt)

def test_to_datetime_with_non_exact(self):

# 8904
# exact kw
if sys.version_info < (2, 7):
raise nose.SkipTest('on python version < 2.7')

s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z'])
result = to_datetime(s,format='%d%b%y',exact=False)
expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y')
assert_series_equal(result, expected)

def test_parse_nanoseconds_with_formula(self):

# GH8989
# trunctaing the nanoseconds when a format was provided
for v in ["2012-01-01 09:00:00.000000001",
"2012-01-01 09:00:00.000001",
"2012-01-01 09:00:00.001",
"2012-01-01 09:00:00.001000",
"2012-01-01 09:00:00.001000000",
]:
expected = pd.to_datetime(v)
result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f")
self.assertEqual(result,expected)

def test_to_datetime_format_weeks(self):
data = [
['2009324', '%Y%W%w', Timestamp('2009-08-13')],
Expand Down
10 changes: 7 additions & 3 deletions pandas/tseries/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)

def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
format=None, coerce=False, unit='ns',
format=None, exact=True, coerce=False, unit='ns',
infer_datetime_format=False):
"""
Convert argument to datetime.
Expand All @@ -194,7 +194,11 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
box : boolean, default True
If True returns a DatetimeIndex, if False returns ndarray of values
format : string, default None
strftime to parse time, eg "%d/%m/%Y"
strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
all the way up to nanoseconds
exact : boolean, True by default
If True, require an exact format match.
If False, allow the format to match anywhere in the target string.
coerce : force errors to NaT (False by default)
unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
(e.g. a unix timestamp), which is an integer/float number
Expand Down Expand Up @@ -273,7 +277,7 @@ def _convert_listlike(arg, box, format):
if result is None:
try:
result = tslib.array_strptime(
arg, format, coerce=coerce
arg, format, exact=exact, coerce=coerce
)
except (tslib.OutOfBoundsDatetime):
if errors == 'raise':
Expand Down
78 changes: 55 additions & 23 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2123,13 +2123,25 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
raise ValueError("Invalid type for timedelta scalar: %s" % type(ts))
return ts.astype('timedelta64[ns]')

def array_strptime(ndarray[object] values, object fmt, coerce=False):
def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False):
"""
Parameters
----------
values : ndarray of string-like objects
fmt : string-like regex
exact : matches must be exact if True, search if False
coerce : if invalid values found, coerce to NaT
"""

cdef:
Py_ssize_t i, n = len(values)
pandas_datetimestruct dts
ndarray[int64_t] iresult
int year, month, day, minute, hour, second, fraction, weekday, julian
object val
int year, month, day, minute, hour, second, weekday, julian, tz
int week_of_year, week_of_year_start
int64_t us, ns
object val, group_key, ampm, found
dict found_key

global _TimeRE_cache, _regex_cache
with _cache_lock:
Expand Down Expand Up @@ -2198,22 +2210,35 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
else:
val = str(val)

found = format_regex.match(val)
if not found:
if coerce:
iresult[i] = iNaT
continue
raise ValueError("time data %r does not match format %r" %
(values[i], fmt))
if len(val) != found.end():
if coerce:
iresult[i] = iNaT
continue
raise ValueError("unconverted data remains: %s" %
values[i][found.end():])
# exact matching
if exact:
found = format_regex.match(val)
if not found:
if coerce:
iresult[i] = iNaT
continue
raise ValueError("time data %r does not match format %r (match)" %
(values[i], fmt))
if len(val) != found.end():
if coerce:
iresult[i] = iNaT
continue
raise ValueError("unconverted data remains: %s" %
values[i][found.end():])

# search
else:
found = format_regex.search(val)
if not found:
if coerce:
iresult[i] = iNaT
continue
raise ValueError("time data %r does not match format %r (search)" %
(values[i], fmt))

year = 1900
month = day = 1
hour = minute = second = fraction = 0
hour = minute = second = ns = us = 0
tz = -1
# Default to -1 to signify that values not known; not critical to have,
# though
Expand Down Expand Up @@ -2278,9 +2303,11 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
second = int(found_dict['S'])
elif parse_code == 10:
s = found_dict['f']
# Pad to always return microseconds.
s += "0" * (6 - len(s))
fraction = int(s)
# Pad to always return nanoseconds
s += "0" * (9 - len(s))
us = long(s)
ns = us % 1000
us = us / 1000
elif parse_code == 11:
weekday = locale_time.f_weekday.index(found_dict['A'].lower())
elif parse_code == 12:
Expand Down Expand Up @@ -2345,7 +2372,8 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
dts.hour = hour
dts.min = minute
dts.sec = second
dts.us = fraction
dts.us = us
dts.ps = ns * 1000

iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
try:
Expand Down Expand Up @@ -4287,7 +4315,7 @@ class TimeRE(dict):
base.__init__({
# The " \d" part of the regex is to make %c from ANSI C work
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P<f>[0-9]{1,6})",
'f': r"(?P<f>[0-9]{1,9})",
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
Expand Down Expand Up @@ -4368,10 +4396,14 @@ _TimeRE_cache = TimeRE()
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
_regex_cache = {}

def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon):
"""Calculate the Julian day based on the year, week of the year, and day of
the week, with week_start_day representing whether the week of the year
assumes the week starts on Sunday or Monday (6 or 0)."""

cdef:
int first_weekday, week_0_length, days_to_week

first_weekday = datetime_date(year, 1, 1).weekday()
# If we are dealing with the %U directive (week starts on Sunday), it's
# easier to just shift the view to Sunday being the first day of the
Expand Down
8 changes: 8 additions & 0 deletions vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None):
Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
start_date=datetime(2012, 7, 1))

# GH 8904: benchmark exact=False format parsing against the
# str.replace-then-exact-match workaround it replaces.
setup = common_setup + """
s = Series(['19MAY11','19MAY11:00:00:00']*100000)
"""
timeseries_with_format_no_exact = Benchmark(
    "to_datetime(s,format='%d%b%y',exact=False)",
    setup, start_date=datetime(2014, 11, 26))
# Raw strings (outer and inner) so the backslash in ':\S+$' is a real
# regex escape rather than a deprecated string-literal escape.
timeseries_with_format_replace = Benchmark(
    r"to_datetime(s.str.replace(r':\S+$',''),format='%d%b%y')",
    setup, start_date=datetime(2014, 11, 26))

# ---- infer_freq
# infer_freq

Expand Down