diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 10b23605cca85..7e8a4c7ba4faf 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -78,6 +78,7 @@ Enhancements - Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`). - ``Series`` now works with map objects the same way as generators (:issue:`8909`). - Added context manager to ``HDFStore`` for automatic closing (:issue:`8791`). +- ``to_datetime`` gains an ``exact`` keyword; when it is ``False``, the provided format string may match anywhere in the target string instead of requiring an exact match. ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`) .. _whatsnew_0152.performance: @@ -85,6 +86,8 @@ Performance ~~~~~~~~~~~ - Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`) +- Performance boost for ``to_datetime`` conversions with a passed ``format=`` and ``exact=False`` (:issue:`8904`) + .. _whatsnew_0152.experimental: Experimental @@ -141,6 +144,7 @@ Bug Fixes - Report a ``TypeError`` when invalid/no paramaters are passed in a groupby (:issue:`8015`) - Regression in DatetimeIndex iteration with a Fixed/Local offset timezone (:issue:`8890`) +- Bug in ``to_datetime`` when parsing nanoseconds using the ``%f`` format (:issue:`8989`) - Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`) - Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`). - ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`). 
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 12e10a71c67b2..7a428fd629125 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -4123,6 +4123,32 @@ def test_to_datetime_format_time(self): for s, format, dt in data: self.assertEqual(to_datetime(s, format=format), dt) + def test_to_datetime_with_non_exact(self): + + # 8904 + # exact kw + if sys.version_info < (2, 7): + raise nose.SkipTest('on python version < 2.7') + + s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']) + result = to_datetime(s,format='%d%b%y',exact=False) + expected = to_datetime(s.str.extract('(\d+\w+\d+)'),format='%d%b%y') + assert_series_equal(result, expected) + + def test_parse_nanoseconds_with_formula(self): + + # GH8989 + # trunctaing the nanoseconds when a format was provided + for v in ["2012-01-01 09:00:00.000000001", + "2012-01-01 09:00:00.000001", + "2012-01-01 09:00:00.001", + "2012-01-01 09:00:00.001000", + "2012-01-01 09:00:00.001000000", + ]: + expected = pd.to_datetime(v) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f") + self.assertEqual(result,expected) + def test_to_datetime_format_weeks(self): data = [ ['2009324', '%Y%W%w', Timestamp('2009-08-13')], diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index f29ab14ed8745..e680fa06a9c8e 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, - format=None, coerce=False, unit='ns', + format=None, exact=True, coerce=False, unit='ns', infer_datetime_format=False): """ Convert argument to datetime. 
@@ -194,7 +194,11 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, box : boolean, default True If True returns a DatetimeIndex, if False returns ndarray of values format : string, default None - strftime to parse time, eg "%d/%m/%Y" + strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + all the way up to nanoseconds + exact : boolean, True by default + If True, require an exact format match. + If False, allow the format to match anywhere in the target string. coerce : force errors to NaT (False by default) unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number @@ -273,7 +277,7 @@ def _convert_listlike(arg, box, format): if result is None: try: result = tslib.array_strptime( - arg, format, coerce=coerce + arg, format, exact=exact, coerce=coerce ) except (tslib.OutOfBoundsDatetime): if errors == 'raise': diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index d7c5d656d71e0..4cb6c93bdf3d0 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2123,13 +2123,25 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): raise ValueError("Invalid type for timedelta scalar: %s" % type(ts)) return ts.astype('timedelta64[ns]') -def array_strptime(ndarray[object] values, object fmt, coerce=False): +def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False): + """ + Parameters + ---------- + values : ndarray of string-like objects + fmt : string-like regex + exact : matches must be exact if True, search if False + coerce : if invalid values found, coerce to NaT + """ + cdef: Py_ssize_t i, n = len(values) pandas_datetimestruct dts ndarray[int64_t] iresult - int year, month, day, minute, hour, second, fraction, weekday, julian - object val + int year, month, day, minute, hour, second, weekday, julian, tz + int week_of_year, week_of_year_start + int64_t us, ns + object val, group_key, ampm, found + dict found_key global 
_TimeRE_cache, _regex_cache with _cache_lock: @@ -2198,22 +2210,35 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False): else: val = str(val) - found = format_regex.match(val) - if not found: - if coerce: - iresult[i] = iNaT - continue - raise ValueError("time data %r does not match format %r" % - (values[i], fmt)) - if len(val) != found.end(): - if coerce: - iresult[i] = iNaT - continue - raise ValueError("unconverted data remains: %s" % - values[i][found.end():]) + # exact matching + if exact: + found = format_regex.match(val) + if not found: + if coerce: + iresult[i] = iNaT + continue + raise ValueError("time data %r does not match format %r (match)" % + (values[i], fmt)) + if len(val) != found.end(): + if coerce: + iresult[i] = iNaT + continue + raise ValueError("unconverted data remains: %s" % + values[i][found.end():]) + + # search + else: + found = format_regex.search(val) + if not found: + if coerce: + iresult[i] = iNaT + continue + raise ValueError("time data %r does not match format %r (search)" % + (values[i], fmt)) + year = 1900 month = day = 1 - hour = minute = second = fraction = 0 + hour = minute = second = ns = us = 0 tz = -1 # Default to -1 to signify that values not known; not critical to have, # though @@ -2278,9 +2303,11 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False): second = int(found_dict['S']) elif parse_code == 10: s = found_dict['f'] - # Pad to always return microseconds. 
- s += "0" * (6 - len(s)) - fraction = int(s) + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us / 1000 elif parse_code == 11: weekday = locale_time.f_weekday.index(found_dict['A'].lower()) elif parse_code == 12: @@ -2345,7 +2372,8 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False): dts.hour = hour dts.min = minute dts.sec = second - dts.us = fraction + dts.us = us + dts.ps = ns * 1000 iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) try: @@ -4287,7 +4315,7 @@ class TimeRE(dict): base.__init__({ # The " \d" part of the regex is to make %c from ANSI C work 'd': r"(?P3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", - 'f': r"(?P[0-9]{1,6})", + 'f': r"(?P[0-9]{1,9})", 'H': r"(?P2[0-3]|[0-1]\d|\d)", 'I': r"(?P1[0-2]|0[1-9]|[1-9])", 'j': r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])", @@ -4368,10 +4396,14 @@ _TimeRE_cache = TimeRE() _CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache _regex_cache = {} -def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon): +cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): """Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0).""" + + cdef: + int first_weekday, week_0_length, days_to_week + first_weekday = datetime_date(year, 1, 1).weekday() # If we are dealing with the %U directive (week starts on Sunday), it's # easier to just shift the view to Sunday being the first day of the diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index c67cdabdc1a06..f0c3961ae0277 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None): Benchmark('to_datetime(strings,format="%Y%m%d")', setup, start_date=datetime(2012, 7, 
1)) +setup = common_setup + """ +s = Series(['19MAY11','19MAY11:00:00:00']*100000) +""" +timeseries_with_format_no_exact = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \ + setup, start_date=datetime(2014, 11, 26)) +timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \ + setup, start_date=datetime(2014, 11, 26)) + # ---- infer_freq # infer_freq