diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 943f925ec5b04..725da22104efc 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -37,10 +37,10 @@ from tslibs.timezones cimport ( is_utc, is_tzlocal, get_utcoffset, get_dst_info, maybe_get_tz) from tslib cimport _nat_scalar_rules +from tslibs.parsing import parse_time_string, NAT_SENTINEL from tslibs.frequencies cimport get_freq_code from pandas.tseries import offsets -from pandas.core.tools.datetimes import parse_time_string from pandas.tseries import frequencies cdef int64_t NPY_NAT = util.get_nat() @@ -1197,6 +1197,8 @@ class Period(_Period): value = str(value) value = value.upper() dt, _, reso = parse_time_string(value, freq) + if dt is NAT_SENTINEL: + ordinal = iNaT if freq is None: try: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index a2764e87eec55..ed883bf5db5bc 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -1384,165 +1384,6 @@ def convert_sql_column(x): return maybe_convert_objects(x, try_float=1) -def try_parse_dates(ndarray[object] values, parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - n = len(values) - result = np.empty(n, dtype='O') - - if parser is None: - if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year, date.month, 1) - - try: - from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) - except ImportError: # pragma: no cover - def parse_date(s): - try: - return datetime.strptime(s, '%m/%d/%Y') - except Exception: - return s - # EAFP here - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # failed - return values - else: - parse_date = parser - - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # raise if passed parser and it failed - raise - - return result - - -def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, - date_parser=None, time_parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - from datetime import date, time, datetime, timedelta - - n = len(dates) - if len(times) != n: - raise ValueError('Length of dates and times must be equal') - result = np.empty(n, dtype='O') - - if date_parser is None: - if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year, date.month, 1) - - try: - from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) - except ImportError: # pragma: no cover - def parse_date(s): - try: - return date.strptime(s, '%m/%d/%Y') - except Exception: - return s - else: - parse_date = date_parser - - if time_parser is None: - try: - from dateutil.parser import parse - parse_time = lambda x: parse(x) - except ImportError: # pragma: no cover - def parse_time(s): - try: - return time.strptime(s, '%H:%M:%S') - except Exception: - return s - - else: - parse_time = time_parser - - for i from 0 <= i < n: - d = parse_date(str(dates[i])) - t = parse_time(str(times[i])) - result[i] = datetime(d.year, d.month, d.day, - t.hour, t.minute, t.second) - - return result - - -def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, - ndarray[object] days): - cdef: - Py_ssize_t i, n - ndarray[object] result - - from datetime import datetime - - n = len(years) - if len(months) != n or len(days) != n: - raise ValueError('Length of years/months/days must all be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) - - return result - - -def try_parse_datetime_components(ndarray[object] years, - ndarray[object] months, - ndarray[object] days, - ndarray[object] hours, - ndarray[object] minutes, - ndarray[object] seconds): - - cdef: - Py_ssize_t i, n - ndarray[object] result - int secs - double float_secs - double micros - - from datetime import datetime - - n = len(years) - if (len(months) != n or len(days) != n or len(hours) != n or - len(minutes) != n or len(seconds) != n): - raise ValueError('Length of all datetime components must be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - float_secs = float(seconds[i]) - secs = int(float_secs) - - micros = float_secs - secs - if micros > 0: - micros = micros * 1000000 - - result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), - int(hours[i]), int(minutes[i]), secs, - int(micros)) - - return result - - def sanitize_objects(ndarray[object] values, set na_values, convert_empty=True): cdef: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d4ca5af09367e..4c34d0fcb1e5f 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -67,6 +67,9 @@ from khash cimport ( kh_init_int64, kh_int64_t, kh_resize_int64, kh_get_int64) +from .tslibs.parsing import parse_datetime_string +from .tslibs.parsing import DateParseError # noqa + cimport cython import re @@ -1737,26 +1740,6 @@ def datetime_to_datetime64(ndarray[object] values): return result, inferred_tz -cdef: - set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) - -cpdef bint _does_string_look_like_datetime(object date_string): - if date_string.startswith('0'): - # Strings starting with 0 are more consistent with a - # date-like string than a number - return True - - try: - if float(date_string) < 1000: - return False - except ValueError: - pass - - if date_string in _not_datelike_strings: - return False - - return True - def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None): @@ -1841,257 +1824,6 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -class DateParseError(ValueError): - pass - - -cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') - - -def parse_datetime_string(object date_string, object freq=None, - dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime. - Also cares special handling matching time patterns. - - Returns - ------- - datetime - """ - - cdef: - object dt - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - if _TIMEPAT.match(date_string): - # use current datetime as default, not pass _DEFAULT_DATETIME - dt = parse_date(date_string, dayfirst=dayfirst, - yearfirst=yearfirst, **kwargs) - return dt - try: - dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - return dt - except DateParseError: - raise - except ValueError: - pass - - try: - dt = parse_date(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) - except TypeError: - # following may be raised from dateutil - # TypeError: 'NoneType' object is not iterable - raise ValueError('Given date string not likely a datetime.') - - return dt - - -def parse_datetime_string_with_reso(object date_string, object freq=None, - dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime - - Returns - ------- - datetime - """ - - cdef: - object parsed, reso - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - try: - return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - except DateParseError: - raise - except ValueError: - pass - - try: - parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst) - except Exception as e: - # TODO: allow raise of errors within instead - raise DateParseError(e) - if parsed is None: - raise DateParseError("Could not parse %s" % date_string) - return parsed, parsed, reso - - -cdef inline object _parse_dateabbr_string(object date_string, object default, - object freq): - cdef: - object ret - int year, quarter = -1, month, mnum, date_len - - # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 - assert util.is_string_object(date_string) - - # len(date_string) == 0 - # should be NaT??? - - if date_string in _nat_strings: - return NaT, NaT, '' - - date_string = date_string.upper() - date_len = len(date_string) - - if date_len == 4: - # parse year only like 2000 - try: - ret = default.replace(year=int(date_string)) - return ret, ret, 'year' - except ValueError: - pass - - try: - if 4 <= date_len <= 7: - i = date_string.index('Q', 1, 6) - if i == 1: - quarter = int(date_string[0]) - if date_len == 4 or (date_len == 5 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d)') - year = 2000 + int(date_string[-2:]) - elif date_len == 6 or (date_len == 7 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d\d\d)') - year = int(date_string[-4:]) - else: - raise ValueError - elif i == 2 or i == 3: - # r'(\d\d)-?Q(\d)' - if date_len == 4 or (date_len == 5 - and date_string[i - 1] == '-'): - quarter = int(date_string[-1]) - year = 2000 + int(date_string[:2]) - else: - raise ValueError - elif i == 4 or i == 5: - if date_len == 6 or (date_len == 7 - and date_string[i - 1] == '-'): - # r'(\d\d\d\d)-?Q(\d)' - quarter = int(date_string[-1]) - year = int(date_string[:4]) - else: - raise ValueError - - if not (1 <= quarter <= 4): - msg = ('Incorrect quarterly string is given, quarter must be ' - 'between 1 and 4: {0}') - raise DateParseError(msg.format(date_string)) - - if freq is not None: - # hack attack, #1228 - try: - mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 - except (KeyError, ValueError): - msg = ('Unable to retrieve month information from given ' - 'freq: {0}').format(freq) - raise DateParseError(msg) - - month = (mnum + (quarter - 1) * 3) % 12 + 1 - if month > mnum: - year -= 1 - else: - month = (quarter - 1) * 3 + 1 - - ret = default.replace(year=year, month=month) - return ret, ret, 'quarter' - - except DateParseError: - raise - except ValueError: - pass - - if date_len == 6 and (freq == 'M' or getattr( - freq, 'rule_code', None) == 'M'): - year = int(date_string[:4]) - month = int(date_string[4:6]) - try: - ret = default.replace(year=year, month=month) - return ret, ret, 'month' - except ValueError: - pass - - for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: - try: - ret = datetime.strptime(date_string, pat) - return ret, ret, 'month' - except ValueError: - pass - - raise ValueError('Unable to parse {0}'.format(date_string)) - - -def dateutil_parse(object timestr, object default, ignoretz=False, - tzinfos=None, **kwargs): - """ lifted from dateutil to get resolution""" - - cdef: - object fobj, res, attr, ret, tzdata - object reso = None - dict repl = {} - - fobj = StringIO(str(timestr)) - res = DEFAULTPARSER._parse(fobj, **kwargs) - - # dateutil 2.2 compat - if isinstance(res, tuple): - res, _ = res - - if res is None: - msg = "Unknown datetime string format, unable to parse: {0}" - raise ValueError(msg.format(timestr)) - - for attr in ["year", "month", "day", "hour", - "minute", "second", "microsecond"]: - value = getattr(res, attr) - if value is not None: - repl[attr] = value - reso = attr - - if reso is None: - msg = "Unable to parse datetime string: {0}" - raise ValueError(msg.format(timestr)) - - if reso == 'microsecond': - if repl['microsecond'] == 0: - reso = 'second' - elif repl['microsecond'] % 1000 == 0: - reso = 'millisecond' - - ret = default.replace(**repl) - if res.weekday is not None and not res.day: - ret = ret + relativedelta.relativedelta(weekday=res.weekday) - if not ignoretz: - if callable(tzinfos) or tzinfos and res.tzname in tzinfos: - if callable(tzinfos): - tzdata = tzinfos(res.tzname, res.tzoffset) - else: - tzdata = tzinfos.get(res.tzname) - if isinstance(tzdata, datetime.tzinfo): - tzinfo = tzdata - elif isinstance(tzdata, string_types): - tzinfo = _dateutil_tzstr(tzdata) - elif isinstance(tzdata, int): - tzinfo = tzoffset(res.tzname, tzdata) - else: - raise ValueError("offset must be tzinfo subclass, " - "tz string, or int offset") - ret = ret.replace(tzinfo=tzinfo) - elif res.tzname and res.tzname in time.tzname: - ret = ret.replace(tzinfo=_dateutil_tzlocal()) - elif res.tzoffset == 0: - ret = ret.replace(tzinfo=_dateutil_tzutc()) - elif res.tzoffset: - ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) - return ret, reso - - # const for parsers _DEFAULT_DATETIME = datetime(1, 1, 1).replace( diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx new file mode 100644 index 0000000000000..845d1b8dcabba --- /dev/null +++ b/pandas/_libs/tslibs/parsing.pyx @@ -0,0 +1,681 @@ +# -*- coding: utf-8 -*- +# cython: profile=False +# cython: linetrace=False +# distutils: define_macros=CYTHON_TRACE=0 +# distutils: define_macros=CYTHON_TRACE_NOGIL=0 +""" +Parsing functions for datetime and datetime-like strings. +""" +import sys +import re + +from cpython cimport PyString_Check, PyUnicode_Check + +from libc.stdlib cimport free + +cimport cython +from cython cimport Py_ssize_t + + +from datetime import datetime +import time + +import numpy as np +cimport numpy as np +from numpy cimport int64_t, ndarray +np.import_array() + +# Avoid import from outside _libs +if sys.version_info.major == 2: + string_types = basestring + from StringIO import StringIO +else: + string_types = str + from io import StringIO + + +# dateutil compat +from dateutil.tz import (tzoffset, + tzlocal as _dateutil_tzlocal, + tzfile as _dateutil_tzfile, + tzutc as _dateutil_tzutc, + tzstr as _dateutil_tzstr) +from dateutil.relativedelta import relativedelta +from dateutil.parser import DEFAULTPARSER +from dateutil.parser import parse as du_parse + + +class DateParseError(ValueError): + pass + +_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + +_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, + second=0, microsecond=0) +_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] +_MONTH_NUMBERS = {k: i for i, k in enumerate(_MONTHS)} +_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)} + +cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') + +cdef set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) + +NAT_SENTINEL = object() +# This allows us to reference NaT without having to import it + + +def parse_datetime_string(date_string, freq=None, dayfirst=False, + yearfirst=False, **kwargs): + """parse datetime string, only returns datetime. + Also cares special handling matching time patterns. + + Returns + ------- + datetime + """ + + cdef: + object dt + + if not _does_string_look_like_datetime(date_string): + raise ValueError('Given date string not likely a datetime.') + + if _TIMEPAT.match(date_string): + # use current datetime as default, not pass _DEFAULT_DATETIME + dt = du_parse(date_string, dayfirst=dayfirst, + yearfirst=yearfirst, **kwargs) + return dt + + try: + dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + return dt + except DateParseError: + raise + except ValueError: + pass + + try: + dt = du_parse(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + except TypeError: + # following may be raised from dateutil + # TypeError: 'NoneType' object is not iterable + raise ValueError('Given date string not likely a datetime.') + + return dt + + +def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): + """ + Try hard to parse datetime string, leveraging dateutil plus some extra + goodies like quarter recognition. + + Parameters + ---------- + arg : compat.string_types + freq : str or DateOffset, default None + Helps with interpreting time string if supplied + dayfirst : bool, default None + If None uses default from print_config + yearfirst : bool, default None + If None uses default from print_config + + Returns + ------- + datetime, datetime/dateutil.parser._result, str + """ + if not isinstance(arg, string_types): + return arg + + if getattr(freq, "_typ", None) == "dateoffset": + freq = freq.rule_code + + if dayfirst is None: + from pandas.core.config import get_option + dayfirst = get_option("display.date_dayfirst") + if yearfirst is None: + from pandas.core.config import get_option + yearfirst = get_option("display.date_yearfirst") + + res = parse_datetime_string_with_reso(arg, freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst) + if res[0] is NAT_SENTINEL: + from pandas._libs.tslib import NaT + res = (NaT,) + res[1:] + return res + + +def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, + yearfirst=False, **kwargs): + """parse datetime string, only returns datetime + + Returns + ------- + datetime + """ + + cdef: + object parsed, reso + + if not _does_string_look_like_datetime(date_string): + raise ValueError('Given date string not likely a datetime.') + + try: + return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + except DateParseError: + raise + except ValueError: + pass + + try: + parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst) + except Exception as e: + # TODO: allow raise of errors within instead + raise DateParseError(e) + if parsed is None: + raise DateParseError("Could not parse %s" % date_string) + return parsed, parsed, reso + + +cpdef bint _does_string_look_like_datetime(object date_string): + if date_string.startswith('0'): + # Strings starting with 0 are more consistent with a + # date-like string than a number + return True + + try: + if float(date_string) < 1000: + return False + except ValueError: + pass + + if date_string in _not_datelike_strings: + return False + + return True + + +cdef inline object _parse_dateabbr_string(object date_string, object default, + object freq): + cdef: + object ret + int year, quarter = -1, month, mnum, date_len + + # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 + assert isinstance(date_string, string_types) + + # len(date_string) == 0 + # should be NaT??? + + if date_string in _nat_strings: + return NAT_SENTINEL, NAT_SENTINEL, '' + + date_string = date_string.upper() + date_len = len(date_string) + + if date_len == 4: + # parse year only like 2000 + try: + ret = default.replace(year=int(date_string)) + return ret, ret, 'year' + except ValueError: + pass + + try: + if 4 <= date_len <= 7: + i = date_string.index('Q', 1, 6) + if i == 1: + quarter = int(date_string[0]) + if date_len == 4 or (date_len == 5 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d)') + year = 2000 + int(date_string[-2:]) + elif date_len == 6 or (date_len == 7 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d\d\d)') + year = int(date_string[-4:]) + else: + raise ValueError + elif i == 2 or i == 3: + # r'(\d\d)-?Q(\d)' + if date_len == 4 or (date_len == 5 + and date_string[i - 1] == '-'): + quarter = int(date_string[-1]) + year = 2000 + int(date_string[:2]) + else: + raise ValueError + elif i == 4 or i == 5: + if date_len == 6 or (date_len == 7 + and date_string[i - 1] == '-'): + # r'(\d\d\d\d)-?Q(\d)' + quarter = int(date_string[-1]) + year = int(date_string[:4]) + else: + raise ValueError + + if not (1 <= quarter <= 4): + msg = ('Incorrect quarterly string is given, quarter must be ' + 'between 1 and 4: {0}') + raise DateParseError(msg.format(date_string)) + + if freq is not None: + # hack attack, #1228 + try: + mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 + except (KeyError, ValueError): + msg = ('Unable to retrieve month information from given ' + 'freq: {0}').format(freq) + raise DateParseError(msg) + + month = (mnum + (quarter - 1) * 3) % 12 + 1 + if month > mnum: + year -= 1 + else: + month = (quarter - 1) * 3 + 1 + + ret = default.replace(year=year, month=month) + return ret, ret, 'quarter' + + except DateParseError: + raise + except ValueError: + pass + + if date_len == 6 and (freq == 'M' or + getattr(freq, 'rule_code', None) == 'M'): + year = int(date_string[:4]) + month = int(date_string[4:6]) + try: + ret = default.replace(year=year, month=month) + return ret, ret, 'month' + except ValueError: + pass + + for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: + try: + ret = datetime.strptime(date_string, pat) + return ret, ret, 'month' + except ValueError: + pass + + raise ValueError('Unable to parse {0}'.format(date_string)) + + +def dateutil_parse(object timestr, object default, ignoretz=False, + tzinfos=None, **kwargs): + """ lifted from dateutil to get resolution""" + + cdef: + object fobj, res, attr, ret, tzdata + object reso = None + dict repl = {} + + fobj = StringIO(str(timestr)) + res = DEFAULTPARSER._parse(fobj, **kwargs) + + # dateutil 2.2 compat + if isinstance(res, tuple): # PyTuple_Check + res, _ = res + + if res is None: + msg = "Unknown datetime string format, unable to parse: {0}" + raise ValueError(msg.format(timestr)) + + for attr in ["year", "month", "day", "hour", + "minute", "second", "microsecond"]: + value = getattr(res, attr) + if value is not None: + repl[attr] = value + reso = attr + + if reso is None: + msg = "Unable to parse datetime string: {0}" + raise ValueError(msg.format(timestr)) + + if reso == 'microsecond': + if repl['microsecond'] == 0: + reso = 'second' + elif repl['microsecond'] % 1000 == 0: + reso = 'millisecond' + + ret = default.replace(**repl) + if res.weekday is not None and not res.day: + ret = ret + relativedelta.relativedelta(weekday=res.weekday) + if not ignoretz: + if callable(tzinfos) or tzinfos and res.tzname in tzinfos: + if callable(tzinfos): + tzdata = tzinfos(res.tzname, res.tzoffset) + else: + tzdata = tzinfos.get(res.tzname) + if isinstance(tzdata, datetime.tzinfo): + tzinfo = tzdata + elif isinstance(tzdata, string_types): + tzinfo = _dateutil_tzstr(tzdata) + elif isinstance(tzdata, int): + tzinfo = tzoffset(res.tzname, tzdata) + else: + raise ValueError("offset must be tzinfo subclass, " + "tz string, or int offset") + ret = ret.replace(tzinfo=tzinfo) + elif res.tzname and res.tzname in time.tzname: + ret = ret.replace(tzinfo=_dateutil_tzlocal()) + elif res.tzoffset == 0: + ret = ret.replace(tzinfo=_dateutil_tzutc()) + elif res.tzoffset: + ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) + return ret, reso + + +cpdef object _get_rule_month(object source, object default='DEC'): + """ + Return starting month of given freq, default is December. + + Example + ------- + >>> _get_rule_month('D') + 'DEC' + + >>> _get_rule_month('A-JAN') + 'JAN' + """ + if hasattr(source, 'freqstr'): + source = source.freqstr + source = source.upper() + if '-' not in source: + return default + else: + return source.split('-')[1] + + +#---------------------------------------------------------------------- +# Parsing for type-inference + + +def try_parse_dates(ndarray[object] values, parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + # EAFP here + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values + else: + parse_date = parser + + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # raise if passed parser and it failed + raise + + return result + + +def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, + date_parser=None, time_parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(dates) + if len(times) != n: + raise ValueError('Length of dates and times must be equal') + result = np.empty(n, dtype='O') + + if date_parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + else: + parse_date = date_parser + + if time_parser is None: + parse_time = lambda x: du_parse(x) + + else: + parse_time = time_parser + + for i from 0 <= i < n: + d = parse_date(str(dates[i])) + t = parse_time(str(times[i])) + result[i] = datetime(d.year, d.month, d.day, + t.hour, t.minute, t.second) + + return result + + +def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, + ndarray[object] days): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(years) + if len(months) != n or len(days) != n: + raise ValueError('Length of years/months/days must all be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) + + return result + + +def try_parse_datetime_components(ndarray[object] years, + ndarray[object] months, + ndarray[object] days, + ndarray[object] hours, + ndarray[object] minutes, + ndarray[object] seconds): + + cdef: + Py_ssize_t i, n + ndarray[object] result + int secs + double float_secs + double micros + + n = len(years) + if (len(months) != n or len(days) != n or len(hours) != n or + len(minutes) != n or len(seconds) != n): + raise ValueError('Length of all datetime components must be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + float_secs = float(seconds[i]) + secs = int(float_secs) + + micros = float_secs - secs + if micros > 0: + micros = micros * 1000000 + + result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), + int(hours[i]), int(minutes[i]), secs, + int(micros)) + + return result + + +#---------------------------------------------------------------------- +# Miscellaneous + +_DATEUTIL_LEXER_SPLIT = None +try: + # Since these are private methods from dateutil, it is safely imported + # here so in case this interface changes, pandas will just fallback + # to not using the functionality + from dateutil.parser import _timelex + + if hasattr(_timelex, 'split'): + def _lexer_split_from_str(dt_str): + # The StringIO(str(_)) is for dateutil 2.2 compatibility + return _timelex.split(StringIO(str(dt_str))) + + _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str +except (ImportError, AttributeError): + pass + + +def _format_is_iso(f): + """ + Does format match the iso8601 set that can be handled by the C parser? + Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different + but must be consistent. Leading 0s in dates and times are optional. + """ + iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format + excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] + + for date_sep in [' ', '/', '\\', '-', '.', '']: + for time_sep in [' ', 'T']: + if (iso_template(date_sep=date_sep, + time_sep=time_sep + ).startswith(f) and f not in excluded_formats): + return True + return False + + +def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, + dt_str_split=_DATEUTIL_LEXER_SPLIT): + """ + Guess the datetime format of a given datetime string. + + Parameters + ---------- + dt_str : string, datetime string to guess the format of + dayfirst : boolean, default False + If True parses dates with the day first, eg 20/01/2005 + Warning: dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug). + dt_str_parse : function, defaults to `compat.parse_date` (dateutil) + This function should take in a datetime string and return + a `datetime.datetime` guess that the datetime string represents + dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) + This function should take in a datetime string and return + a list of strings, the guess of the various specific parts + e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30'] + + Returns + ------- + ret : datetime format string (for `strftime` or `strptime`) + """ + if dt_str_parse is None or dt_str_split is None: + return None + + if not isinstance(dt_str, string_types): + return None + + day_attribute_and_format = (('day',), '%d', 2) + + # attr name, format, padding (if any) + datetime_attrs_to_format = [ + (('year', 'month', 'day'), '%Y%m%d', 0), + (('year',), '%Y', 0), + (('month',), '%B', 0), + (('month',), '%b', 0), + (('month',), '%m', 2), + day_attribute_and_format, + (('hour',), '%H', 2), + (('minute',), '%M', 2), + (('second',), '%S', 2), + (('microsecond',), '%f', 6), + (('second', 'microsecond'), '%S.%f', 0), + ] + + if dayfirst: + datetime_attrs_to_format.remove(day_attribute_and_format) + datetime_attrs_to_format.insert(0, day_attribute_and_format) + + try: + parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) + except: + # In case the datetime can't be parsed, its format cannot be guessed + return None + + if parsed_datetime is None: + return None + + try: + tokens = dt_str_split(dt_str) + except: + # In case the datetime string can't be split, its format cannot + # be guessed + return None + + format_guess = [None] * len(tokens) + found_attrs = set() + + for attrs, attr_format, padding in datetime_attrs_to_format: + # If a given attribute has been placed in the format string, skip + # over other formats for that same underlying attribute (IE, month + # can be represented in multiple different ways) + if set(attrs) & found_attrs: + continue + + if all(getattr(parsed_datetime, attr) is not None for attr in attrs): + for i, token_format in enumerate(format_guess): + token_filled = tokens[i].zfill(padding) + if (token_format is None and + token_filled == parsed_datetime.strftime(attr_format)): + format_guess[i] = attr_format + tokens[i] = token_filled + found_attrs.update(attrs) + break + + # Only consider it a valid guess if we have a year, month and day + if len(set(['year', 'month', 'day']) & found_attrs) != 3: + return None + + output_format = [] + for i, guess in enumerate(format_guess): + if guess is not None: + # Either fill in the format placeholder (like %Y) + output_format.append(guess) + else: + # Or just the token separate (IE, the dashes in "01-01-2013") + try: + # If the token is numeric, then we likely didn't parse it + # properly, so our guess is wrong + float(tokens[i]) + return None + except ValueError: + pass + + output_format.append(tokens[i]) + + guessed_format = ''.join(output_format) + + # rebuild string, capturing any inferred padding + dt_str = ''.join(tokens) + if parsed_datetime.strftime(guessed_format) == dt_str: + return guessed_format + else: + return None diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f28ff9697e517..79c89f4ad2e25 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7,6 +7,7 @@ algos as libalgos, join as libjoin, Timestamp, Timedelta, ) from pandas._libs.lib import is_datetime_array +from pandas._libs.tslibs import parsing from pandas.compat import range, u from pandas.compat.numpy import function as nv @@ -1037,7 +1038,7 @@ def to_datetime(self, dayfirst=False): if self.inferred_type == 'string': from dateutil.parser import parse parser = lambda x: parse(x, dayfirst=dayfirst) - parsed = lib.try_parse_dates(self.values, parser=parser) + parsed = parsing.try_parse_dates(self.values, parser=parser) return DatetimeIndex(parsed) else: return DatetimeIndex(self.values) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index bf89509fd1746..97ac8445faf4c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -2,9 +2,14 @@ import numpy as np from collections import MutableMapping -from pandas._libs import lib, tslib +from pandas._libs import tslib from pandas._libs.tslibs.strptime import array_strptime from pandas._libs.tslibs.timezones import get_timezone +from pandas._libs.tslibs import parsing +from pandas._libs.tslibs.parsing import ( # noqa + parse_time_string, + _format_is_iso, + _guess_datetime_format) from pandas.core.dtypes.common import ( _ensure_object, @@ -19,28 +24,10 @@ is_numeric_dtype) from pandas.core.dtypes.generic import ( ABCIndexClass, ABCSeries, - ABCDataFrame, ABCDateOffset) + ABCDataFrame) from pandas.core.dtypes.missing import notna from pandas.core import algorithms -import pandas.compat as compat - -_DATEUTIL_LEXER_SPLIT = None -try: - # Since these are private methods from dateutil, it is safely imported - # here so in case this interface changes, pandas will just fallback - # to not using the functionality - from dateutil.parser import _timelex - - if hasattr(_timelex, 'split'): - def _lexer_split_from_str(dt_str): - # The StringIO(str(_)) is for dateutil 2.2 compatibility - return _timelex.split(compat.StringIO(str(dt_str))) - - _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str -except (ImportError, AttributeError): - pass - def _infer_tzinfo(start, end): def _infer(a, b): @@ -60,123 +47,6 @@ def _infer(a, b): return tz -def _guess_datetime_format(dt_str, dayfirst=False, - dt_str_parse=compat.parse_date, - dt_str_split=_DATEUTIL_LEXER_SPLIT): - """ - Guess the datetime format of a given datetime string. - - Parameters - ---------- - dt_str : string, datetime string to guess the format of - dayfirst : boolean, default False - If True parses dates with the day first, eg 20/01/2005 - Warning: dayfirst=True is not strict, but will prefer to parse - with day first (this is a known bug). - dt_str_parse : function, defaults to `compat.parse_date` (dateutil) - This function should take in a datetime string and return - a `datetime.datetime` guess that the datetime string represents - dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) - This function should take in a datetime string and return - a list of strings, the guess of the various specific parts - e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30'] - - Returns - ------- - ret : datetime format string (for `strftime` or `strptime`) - """ - if dt_str_parse is None or dt_str_split is None: - return None - - if not isinstance(dt_str, compat.string_types): - return None - - day_attribute_and_format = (('day',), '%d', 2) - - # attr name, format, padding (if any) - datetime_attrs_to_format = [ - (('year', 'month', 'day'), '%Y%m%d', 0), - (('year',), '%Y', 0), - (('month',), '%B', 0), - (('month',), '%b', 0), - (('month',), '%m', 2), - day_attribute_and_format, - (('hour',), '%H', 2), - (('minute',), '%M', 2), - (('second',), '%S', 2), - (('microsecond',), '%f', 6), - (('second', 'microsecond'), '%S.%f', 0), - ] - - if dayfirst: - datetime_attrs_to_format.remove(day_attribute_and_format) - datetime_attrs_to_format.insert(0, day_attribute_and_format) - - try: - parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) - except: - # In case the datetime can't be parsed, its format cannot be guessed - return None - - if parsed_datetime is None: - return None - - try: - tokens = dt_str_split(dt_str) - except: - # In case the datetime string can't be split, its format cannot - # be guessed - return None - - format_guess = [None] * len(tokens) - found_attrs = set() - - for attrs, attr_format, padding in datetime_attrs_to_format: - # If a given attribute has been placed in the format string, skip - # over other formats for that same underlying attribute (IE, month - # can be represented in multiple different ways) - if set(attrs) & found_attrs: - continue - - if all(getattr(parsed_datetime, attr) is not None for attr in attrs): - for i, token_format in enumerate(format_guess): - token_filled = tokens[i].zfill(padding) - if (token_format is None and - token_filled == parsed_datetime.strftime(attr_format)): - format_guess[i] = attr_format - tokens[i] = token_filled - found_attrs.update(attrs) - break - - # Only consider it a valid guess if we have a year, month and day - if len(set(['year', 'month', 'day']) & found_attrs) != 3: - return None - - output_format = [] - for i, guess in enumerate(format_guess): - if guess is not None: - # Either fill in the format placeholder (like %Y) - output_format.append(guess) - else: - # Or just the token separate (IE, the dashes in "01-01-2013") - try: - # If the token is numeric, then we likely didn't parse it - # properly, so our guess is wrong - float(tokens[i]) - return None - except ValueError: - pass - - output_format.append(tokens[i]) - - guessed_format = ''.join(output_format) - - # rebuild string, capturing any inferred padding - dt_str = ''.join(tokens) - if parsed_datetime.strftime(guessed_format) == dt_str: - return guessed_format - - def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element non_nan_elements = notna(arr).nonzero()[0] @@ -655,9 +525,9 @@ def _attempt_YYYYMMDD(arg, errors): def calc(carg): # calculate the actual result carg = carg.astype(object) - parsed = lib.try_parse_year_month_day(carg / 10000, - carg / 100 % 100, - carg % 100) + parsed = parsing.try_parse_year_month_day(carg / 10000, + carg / 100 % 100, + carg % 100) return tslib.array_to_datetime(parsed, errors=errors) def calc_with_mask(carg, mask): @@ -691,60 +561,6 @@ def calc_with_mask(carg, mask): return None -def _format_is_iso(f): - """ - Does format match the iso8601 set that can be handled by the C parser? - Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different - but must be consistent. Leading 0s in dates and times are optional. - """ - iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format - excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] - - for date_sep in [' ', '/', '\\', '-', '.', '']: - for time_sep in [' ', 'T']: - if (iso_template(date_sep=date_sep, - time_sep=time_sep - ).startswith(f) and f not in excluded_formats): - return True - return False - - -def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): - """ - Try hard to parse datetime string, leveraging dateutil plus some extra - goodies like quarter recognition. - - Parameters - ---------- - arg : compat.string_types - freq : str or DateOffset, default None - Helps with interpreting time string if supplied - dayfirst : bool, default None - If None uses default from print_config - yearfirst : bool, default None - If None uses default from print_config - - Returns - ------- - datetime, datetime/dateutil.parser._result, str - """ - from pandas.core.config import get_option - if not isinstance(arg, compat.string_types): - return arg - - if isinstance(freq, ABCDateOffset): - freq = freq.rule_code - - if dayfirst is None: - dayfirst = get_option("display.date_dayfirst") - if yearfirst is None: - yearfirst = get_option("display.date_yearfirst") - - return tslib.parse_datetime_string_with_reso(arg, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - - DateParseError = tslib.DateParseError normalize_date = tslib.normalize_date diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 080d6c3e273a3..377373f8a0135 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -1,20 +1,20 @@ """This module is designed for community supported date conversion functions""" from pandas.compat import range, map import numpy as np -import pandas._libs.lib as lib +from pandas._libs.tslibs import parsing def parse_date_time(date_col, time_col): date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) - return lib.try_parse_date_and_time(date_col, time_col) + return parsing.try_parse_date_and_time(date_col, time_col) def parse_date_fields(year_col, month_col, day_col): year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) - return lib.try_parse_year_month_day(year_col, month_col, day_col) + return parsing.try_parse_year_month_day(year_col, month_col, day_col) def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, @@ -25,8 +25,9 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, hour_col = _maybe_cast(hour_col) minute_col = _maybe_cast(minute_col) second_col = _maybe_cast(second_col) - return lib.try_parse_datetime_components(year_col, month_col, day_col, - hour_col, minute_col, second_col) + return parsing.try_parse_datetime_components(year_col, month_col, day_col, + hour_col, minute_col, + second_col) def generic_parser(parse_func, *cols): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ed15d4295d688..eeb79552477e1 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -42,7 +42,7 @@ import pandas._libs.lib as lib import pandas._libs.parsers as parsers - +from pandas._libs.tslibs import parsing # BOM character (byte order mark) # This exists at the beginning of a file to indicate endianness @@ -2981,7 +2981,7 @@ def converter(*date_cols): ) except: return tools.to_datetime( - lib.try_parse_dates(strs, dayfirst=dayfirst)) + parsing.try_parse_dates(strs, dayfirst=dayfirst)) else: try: result = tools.to_datetime( @@ -2992,9 +2992,9 @@ def converter(*date_cols): except Exception: try: return tools.to_datetime( - lib.try_parse_dates(_concat_date_cols(date_cols), - parser=date_parser, - dayfirst=dayfirst), + parsing.try_parse_dates(_concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst), errors='ignore') except Exception: return generic_parser(date_parser, *date_cols) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index e0ccedb834adf..bdfe6b5b09e45 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -12,7 +12,8 @@ from distutils.version import LooseVersion import pandas as pd -from pandas._libs import tslib, lib +from pandas._libs import tslib +from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools from pandas.core.tools.datetimes import normalize_date from pandas.compat import lmap @@ -1063,7 +1064,7 @@ def test_does_not_convert_mixed_integer(self): bad_date_strings = ('-50000', '999', '123.1234', 'm', 'T') for bad_date_string in bad_date_strings: - assert not tslib._does_string_look_like_datetime(bad_date_string) + assert not parsing._does_string_look_like_datetime(bad_date_string) good_date_strings = ('2012-01-01', '01/01/2012', @@ -1073,7 +1074,7 @@ def test_does_not_convert_mixed_integer(self): '1-1', ) for good_date_string in good_date_strings: - assert tslib._does_string_look_like_datetime(good_date_string) + assert parsing._does_string_look_like_datetime(good_date_string) def test_parsers(self): @@ -1412,7 +1413,7 @@ class TestArrayToDatetime(object): def test_try_parse_dates(self): arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) - result = lib.try_parse_dates(arr, dayfirst=True) + result = parsing.try_parse_dates(arr, dayfirst=True) expected = [parse(d, dayfirst=True) for d in arr] assert np.array_equal(result, expected) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index e1ae1b577ea29..90103e7bf26b0 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -10,7 +10,7 @@ import pytest import numpy as np -import pandas._libs.lib as lib +from pandas._libs.tslibs import parsing from pandas._libs.lib import Timestamp import pandas as pd @@ -53,7 +53,8 @@ def test_multiple_date_col(self): """ def func(*date_cols): - return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) + res = parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) + return res df = self.read_csv(StringIO(data), header=None, date_parser=func, diff --git a/setup.py b/setup.py index 25a4924dad0bc..d25ae4a5fb45c 100755 --- a/setup.py +++ b/setup.py @@ -343,6 +343,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/parsers.pyx', 'pandas/_libs/tslibs/timezones.pyx', 'pandas/_libs/tslibs/frequencies.pyx', + 'pandas/_libs/tslibs/parsing.pyx', 'pandas/io/sas/sas.pyx'] def initialize_options(self): @@ -498,6 +499,8 @@ def pxd(name): 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c', 'pandas/_libs/src/period_helper.c']}, + '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing', + 'pxdfiles': ['_libs/src/util']}, '_libs.tslibs.frequencies': {'pyxfile': '_libs/tslibs/frequencies', 'pxdfiles': ['_libs/src/util']}, '_libs.index': {'pyxfile': '_libs/index',