pandas-dev · MarcoGorelli · Dec 14, 2022 · MarcoGorelli · Dec 14, 2022 · MarcoGorelli
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -39,6 +39,7 @@ from pandas._libs.tslibs.np_datetime cimport (
     pydatetime_to_dt64,
     string_to_dts,
 )
+from pandas._libs.tslibs.strptime cimport strptime
 from pandas._libs.util cimport (
     is_datetime64_object,
     is_float_object,
@@ -75,6 +76,19 @@ from pandas._libs.tslibs.timestamps import Timestamp
 from pandas._libs.missing cimport checknull_with_nat_and_na
 from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
 
+from _thread import allocate_lock as _thread_allocate_lock
+
+from _strptime import _getlang
+
+from pandas._libs.tslibs.strptime import TimeRE
+
+_cache_lock = _thread_allocate_lock()  # guards the two caches below
+# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
+# first!
+_TimeRE_cache = TimeRE()  # rebuilt when _getlang() no longer matches its locale
+_CACHE_MAX_SIZE = 5  # _regex_cache is emptied once it holds more than this
+_regex_cache = {}  # format string -> compiled regex
+
 
 def _test_parse_iso8601(ts: str):
     """
@@ -524,6 +538,41 @@ cpdef array_to_datetime(
     result = np.empty(n, dtype="M8[ns]")
     iresult = result.view("i8")
 
+    if format is not None and not require_iso8601:
+        if "%W" in format or "%U" in format:
+            if "%Y" not in format and "%y" not in format:
+                raise ValueError("Cannot use '%W' or '%U' without day and year")
+            if "%A" not in format and "%a" not in format and "%w" not in format:
+                raise ValueError("Cannot use '%W' or '%U' without day and year")
+        elif "%Z" in format and "%z" in format:
+            raise ValueError("Cannot parse both %Z and %z")
+
+        global _TimeRE_cache, _regex_cache
+        with _cache_lock:
+            if _getlang() != _TimeRE_cache.locale_time.lang:
+                _TimeRE_cache = TimeRE()
+                _regex_cache.clear()
+            if len(_regex_cache) > _CACHE_MAX_SIZE:
+                _regex_cache.clear()
+            locale_time = _TimeRE_cache.locale_time
+            format_regex = _regex_cache.get(format)
+            if not format_regex:
+                try:
+                    format_regex = _TimeRE_cache.compile(format)
+                # KeyError raised when a bad format is found; can be specified as
+                # \\, in which case it was a stray % but with a space after it
+                except KeyError, err:
+                    bad_directive = err.args[0]
+                    if bad_directive == "\\":
+                        bad_directive = "%"
+                    del err
+                    raise ValueError(f"'{bad_directive}' is a bad directive "
+                                     f"in format '{format}'")
+                # IndexError only occurs when the format string is "%"
+                except IndexError:
+                    raise ValueError(f"stray % in format '{format}'")
+                _regex_cache[format] = format_regex
 if fmt is not None: 
     if "%W" in fmt or "%U" in fmt: 
         if "%Y" not in fmt and "%y" not in fmt: 
             raise ValueError("Cannot use '%W' or '%U' without day and year") 
         if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt: 
             raise ValueError("Cannot use '%W' or '%U' without day and year") 
     elif "%Z" in fmt and "%z" in fmt: 
         raise ValueError("Cannot parse both %Z and %z") 
 global _TimeRE_cache, _regex_cache 
 with _cache_lock: 
     if _getlang() != _TimeRE_cache.locale_time.lang: 
         _TimeRE_cache = TimeRE() 
         _regex_cache.clear() 
     if len(_regex_cache) > _CACHE_MAX_SIZE: 
         _regex_cache.clear() 
     locale_time = _TimeRE_cache.locale_time 
     format_regex = _regex_cache.get(fmt) 
     if not format_regex: 
         try: 
             format_regex = _TimeRE_cache.compile(fmt) 
         # KeyError raised when a bad format is found; can be specified as 
         # \\, in which case it was a stray % but with a space after it 
         except KeyError, err: 
             bad_directive = err.args[0] 
             if bad_directive == "\\": 
                 bad_directive = "%" 
             del err 
             raise ValueError(f"'{bad_directive}' is a bad directive " 
                              f"in format '{fmt}'") 
         # IndexError only occurs when the format string is "%" 
         except IndexError: 
             raise ValueError(f"stray % in format '{fmt}'") 
         _regex_cache[fmt] = format_regex 
 if fmt is not None: 
     if "%W" in fmt or "%U" in fmt: 
         if "%Y" not in fmt and "%y" not in fmt: 
             raise ValueError("Cannot use '%W' or '%U' without day and year") 
         if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt: 
             raise ValueError("Cannot use '%W' or '%U' without day and year") 
     elif "%Z" in fmt and "%z" in fmt: 
         raise ValueError("Cannot parse both %Z and %z") 
  
 global _TimeRE_cache, _regex_cache 
 with _cache_lock: 
     if _getlang() != _TimeRE_cache.locale_time.lang: 
         _TimeRE_cache = TimeRE() 
         _regex_cache.clear() 
     if len(_regex_cache) > _CACHE_MAX_SIZE: 
         _regex_cache.clear() 
     locale_time = _TimeRE_cache.locale_time 
     format_regex = _regex_cache.get(fmt) 
     if not format_regex: 
         try: 
             format_regex = _TimeRE_cache.compile(fmt) 
         # KeyError raised when a bad format is found; can be specified as 
         # \\, in which case it was a stray % but with a space after it 
         except KeyError, err: 
             bad_directive = err.args[0] 
             if bad_directive == "\\": 
                 bad_directive = "%" 
             del err 
             raise ValueError(f"'{bad_directive}' is a bad directive " 
                              f"in format '{fmt}'") 
         # IndexError only occurs when the format string is "%" 
         except IndexError: 
             raise ValueError(f"stray % in format '{fmt}'") 
         _regex_cache[fmt] = format_regex 
+
     try:
         for i in range(n):
             val = values[i]
@@ -556,17 +605,10 @@ cpdef array_to_datetime(
                     seen_datetime = True
                     iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
 
-                elif is_integer_object(val) or is_float_object(val):
-                    if require_iso8601:
-                        if is_coerce:
-                            iresult[i] = NPY_NAT
-                            continue
-                        elif is_raise:
-                            raise ValueError(
-                                f"time data \"{val}\" at position {i} doesn't "
-                                f"match format \"{format}\""
-                            )
-                        return values, tz_out
+                elif (
+                    (is_integer_object(val) or is_float_object(val))
+                    and format is None
+                ):
                     # these must be ns unit by-definition
                     seen_integer = True
 
@@ -585,7 +627,15 @@ cpdef array_to_datetime(
                         except OverflowError:
                             iresult[i] = NPY_NAT
 
-                elif isinstance(val, str):
+                elif (
+                    (is_integer_object(val) or is_float_object(val))
+                    or isinstance(val, str)
+                ):
+                    if not isinstance(val, str):
+                        if val != val or val == NPY_NAT:
+                            iresult[i] = NPY_NAT
+                            continue
+
                     # string
                     if type(val) is not str:
                         # GH#32264 np.str_ object
@@ -595,6 +645,42 @@ cpdef array_to_datetime(
                         iresult[i] = NPY_NAT
                         continue
 
+                    if (
+                        format is not None
+                        and (
+                            not require_iso8601
+                            or (
+                                require_iso8601 and format == "%Y%m%d" and len(val) != 8
+                            )
+                        )
+                        and val not in ("today", "now")
+                    ):
+                        try:
+                            _iresult, _tzinfo = strptime(
+                                val, format, exact, format_regex, locale_time, dts
+                            )
+                        except (ValueError, OverflowError):
+                            if is_coerce:
+                                iresult[i] = NPY_NAT
+                                continue
+                            elif is_raise:
+                                raise
+                            return values, tz_out
+                        value = tz_localize_to_utc_single(_iresult, _tzinfo)
+                        if _tzinfo is not None:
+                            found_tz = True
+                            tz_out = convert_timezone(
+                                _tzinfo,
+                                tz_out,
+                                found_naive,
+                                found_tz,
+                                utc_convert,
+                            )
+                        else:
+                            found_naive = True
+                        iresult[i] = value
+                        continue
+
                     string_to_dts_failed = string_to_dts(
                         val, &dts, &out_bestunit, &out_local,
                         &out_tzoffset, False, format, exact

diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd
@@ -0,0 +1,11 @@
+from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct
+
+
+cdef strptime(  # declaration only; the body is in strptime.pyx
+    val,
+    str fmt,
+    bint exact,
+    format_regex,
+    locale_time,
+    npy_datetimestruct dts,
+)
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -71,6 +71,196 @@ cdef dict _parse_code_table = {"y": 0,
                                "V": 21,
                                "u": 22}
 
+cdef strptime(  # parse val against fmt; returns (NPY_FR_ns value, tz or None)
+    val,  # text to parse; matched against format_regex
+    str fmt,  # format string; used here only in error messages
+    bint exact,  # True: anchored match that must consume val; False: search
+    format_regex,  # regex object; its named groups are directive letters
+    locale_time,  # source of month/weekday names and AM/PM markers
+    npy_datetimestruct dts,  # by-value struct; filled below, then converted
+):
+    if exact:
+        found = format_regex.match(val)
+        if not found:
+            raise ValueError(f"time data '{val}' does not match "
+                             f"format '{fmt}' (match)")
+        if len(val) != found.end():  # the match must span the whole string
+            raise ValueError(f"unconverted data remains: {val[found.end():]}")
+
+    # not exact: accept the pattern anywhere inside val
+    else:
+        found = format_regex.search(val)
+        if not found:
+            raise ValueError(f"time data {repr(val)} does not match format "
+                             f"{repr(fmt)} (search)")
+
+    iso_year = -1  # -1 means no %G was seen
+    year = 1900  # year/month/day/time defaults for fields the format omits
+    month = day = 1
+    hour = minute = second = ns = us = 0
+    tz = None  # stays None unless a %Z or %z group is found
+    # Default to -1 to signify that values not known; not critical to have,
+    # though
+    iso_week = week_of_year = -1
+    week_of_year_start = -1
+    # weekday and julian defaulted to -1 so as to signal need to calculate
+    # values
+    weekday = julian = -1
+    found_dict = found.groupdict()
+    for group_key in found_dict.iterkeys():
+        # Directives not explicitly handled below:
+        #   c, x, X
+        #      handled by making out of other directives
+        #   U, W
+        #      worthless without day of the week
+        parse_code = _parse_code_table[group_key]  # directive -> int code
+
+        if parse_code == 0:  # %y
+            year = int(found_dict["y"])
+            # Open Group specification for strptime() states that a %y
+            # value in the range of [00, 68] is in the century 2000, while
+            # [69,99] is in the century 1900
+            if year <= 68:
+                year += 2000
+            else:
+                year += 1900
+        elif parse_code == 1:  # %Y
+            year = int(found_dict["Y"])
+        elif parse_code == 2:  # %m
+            month = int(found_dict["m"])
+        # elif group_key == 'B':
+        elif parse_code == 3:
+            month = locale_time.f_month.index(found_dict["B"].lower())
+        # elif group_key == 'b':
+        elif parse_code == 4:
+            month = locale_time.a_month.index(found_dict["b"].lower())
+        # elif group_key == 'd':
+        elif parse_code == 5:
+            day = int(found_dict["d"])
+        # elif group_key == 'H':
+        elif parse_code == 6:
+            hour = int(found_dict["H"])
+        elif parse_code == 7:  # %I, combined with %p when present
+            hour = int(found_dict["I"])
+            ampm = found_dict.get("p", "").lower()
+            # If there was no AM/PM indicator, we'll treat this like AM
+            if ampm in ("", locale_time.am_pm[0]):
+                # We're in AM so the hour is correct unless we're
+                # looking at 12 midnight.
+                # 12 midnight == 12 AM == hour 0
+                if hour == 12:
+                    hour = 0
+            elif ampm == locale_time.am_pm[1]:
+                # We're in PM so we need to add 12 to the hour unless
+                # we're looking at 12 noon.
+                # 12 noon == 12 PM == hour 12
+                if hour != 12:
+                    hour += 12
+        elif parse_code == 8:  # %M
+            minute = int(found_dict["M"])
+        elif parse_code == 9:  # %S
+            second = int(found_dict["S"])
+        elif parse_code == 10:  # %f
+            s = found_dict["f"]
+            # Right-pad to 9 digits so the fraction is always in nanoseconds
+            s += "0" * (9 - len(s))
+            us = long(s)  # whole fraction expressed in nanoseconds
+            ns = us % 1000  # part below one microsecond
+            us = us // 1000  # whole microseconds
+        elif parse_code == 11:  # %A
+            weekday = locale_time.f_weekday.index(found_dict["A"].lower())
+        elif parse_code == 12:  # %a
+            weekday = locale_time.a_weekday.index(found_dict["a"].lower())
+        elif parse_code == 13:  # %w
+            weekday = int(found_dict["w"])
+            if weekday == 0:  # 0 maps to 6, the Sunday slot (see %U below)
+                weekday = 6
+            else:
+                weekday -= 1
+        elif parse_code == 14:  # %j
+            julian = int(found_dict["j"])
+        elif parse_code == 15 or parse_code == 16:  # %U or %W
+            week_of_year = int(found_dict[group_key])
+            if group_key == "U":
+                # U starts week on Sunday.
+                week_of_year_start = 6
+            else:
+                # W starts week on Monday.
+                week_of_year_start = 0
+        elif parse_code == 17:  # %Z
+            tz = pytz.timezone(found_dict["Z"])
+        elif parse_code == 19:  # %z
+            tz = parse_timezone_directive(found_dict["z"])
+        elif parse_code == 20:  # %G
+            iso_year = int(found_dict["G"])
+        elif parse_code == 21:  # %V
+            iso_week = int(found_dict["V"])
+        elif parse_code == 22:  # %u
+            weekday = int(found_dict["u"])
+            weekday -= 1  # shift down by one to the 0-based numbering
+
+    # don't assume default values for ISO week/year; %G needs %V + a weekday
+    if iso_year != -1:
+        if iso_week == -1 or weekday == -1:
+            raise ValueError("ISO year directive '%G' must be used with "
+                             "the ISO week directive '%V' and a weekday "
+                             "directive '%A', '%a', '%w', or '%u'.")
+        if julian != -1:
+            raise ValueError("Day of the year directive '%j' is not "
+                             "compatible with ISO year directive '%G'. "
+                             "Use '%Y' instead.")
+    elif year != -1 and week_of_year == -1 and iso_week != -1:  # %V, no %G
+        if weekday == -1:
+            raise ValueError("ISO week directive '%V' must be used with "
+                             "the ISO year directive '%G' and a weekday "
+                             "directive '%A', '%a', '%w', or '%u'.")
+        else:
+            raise ValueError("ISO week directive '%V' is incompatible with "
+                             "the year directive '%Y'. Use the ISO year "
+                             "'%G' instead.")
+
+    # If we know the wk of the year and what day of that wk, we can figure
+    # out the Julian day of the year.
+    if julian == -1 and weekday != -1:
+        if week_of_year != -1:
+            week_starts_Mon = week_of_year_start == 0
+            julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
+                                              week_starts_Mon)
+        elif iso_year != -1 and iso_week != -1:
+            year, julian = _calc_julian_from_V(iso_year, iso_week,
+                                               weekday + 1)
+    # Cannot pre-calculate date() since can change in Julian
+    # calculation and thus could have different value for the day of the wk
+    # calculation.
+    if julian == -1:
+        # Need to add 1 to result since first day of the year is 1, not
+        # 0.
+        ordinal = date(year, month, day).toordinal()  # date() validates y/m/d
+        julian = ordinal - date(year, 1, 1).toordinal() + 1
+    else:
+        # Assume that if they bothered to include Julian day it will
+        # be accurate.
+        datetime_result = date.fromordinal(
+            (julian - 1) + date(year, 1, 1).toordinal())
+        year = datetime_result.year
+        month = datetime_result.month
+        day = datetime_result.day
+    if weekday == -1:
+        weekday = date(year, month, day).weekday()  # not read again below
+
+    dts.year = year
+    dts.month = month
+    dts.day = day
+    dts.hour = hour
+    dts.min = minute
+    dts.sec = second
+    dts.us = us
+    dts.ps = ns * 1000  # sub-microsecond remainder, scaled from ns to ps
+
+    iresult = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
+    check_dts_bounds(&dts)  # NOTE(review): presumably raises if out of range
+    return iresult, tz
+
 
 def array_strptime(
     ndarray[object] values,