From ea2489d54689b47a60110c34eebd4298297df961 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Nov 2014 20:38:15 -0500 Subject: [PATCH 1/2] PERF: add exact kw to to_datetime to enable faster regex format parsing for datetimes (GH8904) --- doc/source/whatsnew/v0.15.2.txt | 2 + pandas/tseries/tests/test_timeseries.py | 10 ++++ pandas/tseries/tools.py | 7 ++- pandas/tslib.pyx | 62 ++++++++++++++++++------- vb_suite/timeseries.py | 8 ++++ 5 files changed, 70 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 10b23605cca85..34f359fe6a54c 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -85,6 +85,8 @@ Performance ~~~~~~~~~~~ - Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`) +- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and the new ``exact=False`` for kw with non-exact matching (:issue:`8904`) + .. _whatsnew_0152.experimental: Experimental diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 12e10a71c67b2..ae68cdabd74c7 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4123,6 +4123,16 @@ def test_to_datetime_format_time(self): for s, format, dt in data: self.assertEqual(to_datetime(s, format=format), dt) + def test_to_datetime_with_non_exact(self): + + if sys.version_info < (2, 7): + raise nose.SkipTest('on python version < 2.7') + + s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']*10000) + result = to_datetime(s,format='%d%b%y',exact=False) + expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y') + assert_series_equal(result, expected) + def test_to_datetime_format_weeks(self): data = [ ['2009324', '%Y%W%w', Timestamp('2009-08-13')], diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index f29ab14ed8745..d556df9280d3f 100644 --- 
a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, - format=None, coerce=False, unit='ns', + format=None, exact=True, coerce=False, unit='ns', infer_datetime_format=False): """ Convert argument to datetime. @@ -195,6 +195,9 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, If True returns a DatetimeIndex, if False returns ndarray of values format : string, default None strftime to parse time, eg "%d/%m/%Y" + exact : boolean, True by default + if True, require an exact format match + if False, search for a matching format non-exclusive to the endpoints coerce : force errors to NaT (False by default) unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number @@ -273,7 +276,7 @@ def _convert_listlike(arg, box, format): if result is None: try: result = tslib.array_strptime( - arg, format, coerce=coerce + arg, format, exact=exact, coerce=coerce ) except (tslib.OutOfBoundsDatetime): if errors == 'raise': diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index d7c5d656d71e0..d4f14992949d3 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2123,13 +2123,24 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): raise ValueError("Invalid type for timedelta scalar: %s" % type(ts)) return ts.astype('timedelta64[ns]') -def array_strptime(ndarray[object] values, object fmt, coerce=False): +def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False): + """ + Parameters + ---------- + values : ndarray of string-like objects + fmt : string-like regex + exact : matches must be exact if True, search if False + coerce : if invalid values found, coerce to NaT + """ + cdef: Py_ssize_t i, n = len(values) pandas_datetimestruct 
dts ndarray[int64_t] iresult - int year, month, day, minute, hour, second, fraction, weekday, julian - object val + int year, month, day, minute, hour, second, fraction, weekday, julian, tz + int week_of_year, week_of_year_start + object val, group_key, ampm, found + dict found_key global _TimeRE_cache, _regex_cache with _cache_lock: @@ -2198,19 +2209,32 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False): else: val = str(val) - found = format_regex.match(val) - if not found: - if coerce: - iresult[i] = iNaT - continue - raise ValueError("time data %r does not match format %r" % - (values[i], fmt)) - if len(val) != found.end(): - if coerce: - iresult[i] = iNaT - continue - raise ValueError("unconverted data remains: %s" % - values[i][found.end():]) + # exact matching + if exact: + found = format_regex.match(val) + if not found: + if coerce: + iresult[i] = iNaT + continue + raise ValueError("time data %r does not match format %r (match)" % + (values[i], fmt)) + if len(val) != found.end(): + if coerce: + iresult[i] = iNaT + continue + raise ValueError("unconverted data remains: %s" % + values[i][found.end():]) + + # search + else: + found = format_regex.search(val) + if not found: + if coerce: + iresult[i] = iNaT + continue + raise ValueError("time data %r does not match format %r (search)" % + (values[i], fmt)) + year = 1900 month = day = 1 hour = minute = second = fraction = 0 @@ -4368,10 +4392,14 @@ _TimeRE_cache = TimeRE() _CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache _regex_cache = {} -def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon): +cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): """Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0).""" + + cdef: + int first_weekday, week_0_length, days_to_week + 
first_weekday = datetime_date(year, 1, 1).weekday() # If we are dealing with the %U directive (week starts on Sunday), it's # easier to just shift the view to Sunday being the first day of the diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index c67cdabdc1a06..1bcd2de5dbefe 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None): Benchmark('to_datetime(strings,format="%Y%m%d")', setup, start_date=datetime(2012, 7, 1)) +setup = common_setup + """ +s = Series(['19MAY11','19MAY11:00:00:00']*100000) +""" +timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \ + setup, start_date=datetime(2014, 11, 26)) +timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \ + setup, start_date=datetime(2014, 11, 26)) + # ---- infer_freq # infer_freq From d6e43373db5ae0201f84254fe4d7ab7684835cb6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Dec 2014 18:53:03 -0500 Subject: [PATCH 2/2] BUG: fix GH8989 to parse nanoseconds with %f format --- doc/source/whatsnew/v0.15.2.txt | 4 +++- pandas/tseries/tests/test_timeseries.py | 18 +++++++++++++++++- pandas/tseries/tools.py | 7 ++++--- pandas/tslib.pyx | 18 +++++++++++------- vb_suite/timeseries.py | 2 +- 5 files changed, 36 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 34f359fe6a54c..7e8a4c7ba4faf 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -78,6 +78,7 @@ Enhancements - Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`). - ``Series`` now works with map objects the same way as generators (:issue:`8909`). - Added context manager to ``HDFStore`` for automatic closing (:issue:`8791`). 
+- ``to_datetime`` gains an ``exact`` keyword to allow for a format to not require an exact match for a provided format string (if it is ``False``). ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`) .. _whatsnew_0152.performance: @@ -85,7 +86,7 @@ Performance ~~~~~~~~~~~ - Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`) -- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and the new ``exact=False`` for kw with non-exact matching (:issue:`8904`) +- Performance boost for ``to_datetime`` conversions with a passed ``format=``, and the ``exact=False`` (:issue:`8904`) .. _whatsnew_0152.experimental: @@ -143,6 +144,7 @@ Bug Fixes - Report a ``TypeError`` when invalid/no paramaters are passed in a groupby (:issue:`8015`) - Regression in DatetimeIndex iteration with a Fixed/Local offset timezone (:issue:`8890`) +- Bug in ``to_datetime`` when parsing nanoseconds using the ``%f`` format (:issue:`8989`) - Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`) - Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`). - ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`). 
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index ae68cdabd74c7..7a428fd629125 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4125,14 +4125,30 @@ def test_to_datetime_format_time(self): def test_to_datetime_with_non_exact(self): + # 8904 + # exact kw if sys.version_info < (2, 7): raise nose.SkipTest('on python version < 2.7') - s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']*10000) + s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']) result = to_datetime(s,format='%d%b%y',exact=False) expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y') assert_series_equal(result, expected) + def test_parse_nanoseconds_with_formula(self): + + # GH8989 + # trunctaing the nanoseconds when a format was provided + for v in ["2012-01-01 09:00:00.000000001", + "2012-01-01 09:00:00.000001", + "2012-01-01 09:00:00.001", + "2012-01-01 09:00:00.001000", + "2012-01-01 09:00:00.001000000", + ]: + expected = pd.to_datetime(v) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f") + self.assertEqual(result,expected) + def test_to_datetime_format_weeks(self): data = [ ['2009324', '%Y%W%w', Timestamp('2009-08-13')], diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index d556df9280d3f..e680fa06a9c8e 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -194,10 +194,11 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, box : boolean, default True If True returns a DatetimeIndex, if False returns ndarray of values format : string, default None - strftime to parse time, eg "%d/%m/%Y" + strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + all the way up to nanoseconds exact : boolean, True by default - if True, require an exact format match - if False, search for a matching format non-exclusive to the endpoints + If True, require an exact format 
match. + If False, allow the format to match anywhere in the target string. coerce : force errors to NaT (False by default) unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index d4f14992949d3..4cb6c93bdf3d0 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2137,8 +2137,9 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe Py_ssize_t i, n = len(values) pandas_datetimestruct dts ndarray[int64_t] iresult - int year, month, day, minute, hour, second, fraction, weekday, julian, tz + int year, month, day, minute, hour, second, weekday, julian, tz int week_of_year, week_of_year_start + int64_t us, ns object val, group_key, ampm, found dict found_key @@ -2237,7 +2238,7 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe year = 1900 month = day = 1 - hour = minute = second = fraction = 0 + hour = minute = second = ns = us = 0 tz = -1 # Default to -1 to signify that values not known; not critical to have, # though @@ -2302,9 +2303,11 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe second = int(found_dict['S']) elif parse_code == 10: s = found_dict['f'] - # Pad to always return microseconds. 
- s += "0" * (6 - len(s)) - fraction = int(s) + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us / 1000 elif parse_code == 11: weekday = locale_time.f_weekday.index(found_dict['A'].lower()) elif parse_code == 12: @@ -2369,7 +2372,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe dts.hour = hour dts.min = minute dts.sec = second - dts.us = fraction + dts.us = us + dts.ps = ns * 1000 iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) try: @@ -4311,7 +4315,7 @@ class TimeRE(dict): base.__init__({ # The " \d" part of the regex is to make %c from ANSI C work 'd': r"(?P3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", - 'f': r"(?P[0-9]{1,6})", + 'f': r"(?P[0-9]{1,9})", 'H': r"(?P2[0-3]|[0-1]\d|\d)", 'I': r"(?P1[0-2]|0[1-9]|[1-9])", 'j': r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])", diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index 1bcd2de5dbefe..f0c3961ae0277 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -159,7 +159,7 @@ def date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ s = Series(['19MAY11','19MAY11:00:00:00']*100000) """ -timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \ +timeseries_with_format_no_exact = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \ setup, start_date=datetime(2014, 11, 26)) timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \ setup, start_date=datetime(2014, 11, 26))