From 71ecc0f180954128ac4ab9a75eb84285c683a59c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 15 Dec 2022 10:26:50 +0000 Subject: [PATCH 1/4] less broad try-except --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslibs/strptime.pyx | 414 +++++++++++++------------ pandas/core/tools/datetimes.py | 28 +- pandas/tests/tools/test_to_datetime.py | 43 ++- 4 files changed, 252 insertions(+), 234 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4425565fbcc8a..9c5656a1e68fd 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -794,6 +794,7 @@ Datetimelike - Bug in :func:`to_datetime` was raising ``ValueError`` when parsing empty string and non-ISO8601 format was passed. Now, empty strings will be parsed as :class:`NaT`, for compatibility with how is done for ISO8601 formats (:issue:`50251`) - Bug in :class:`Timestamp` was showing ``UserWarning``, which was not actionable by users, when parsing non-ISO8601 delimited date strings (:issue:`50232`) - Bug in :func:`to_datetime` was showing misleading ``ValueError`` when parsing dates with format containing ISO week directive and ISO weekday directive (:issue:`50308`) +- Bug in :func:`to_datetime` was not raising ``ValueError`` when invalid format was passed and ``errors`` was ``'ignore'`` or ``'coerce'`` (:issue:`50266`) - Timedelta diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 53330e806215c..fd5b262d7e3b9 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -41,6 +41,7 @@ from pandas._libs.tslibs.np_datetime cimport ( pydate_to_dt64, pydatetime_to_dt64, ) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.util cimport ( is_datetime64_object, @@ -188,192 +189,203 @@ def array_strptime( for i in range(n): val = values[i] - if isinstance(val, str): - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - continue - elif checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - continue - elif PyDateTime_Check(val): - if val.tzinfo is not None: - found_tz = True - else: - found_naive = True - tz_out = convert_timezone( - val.tzinfo, - tz_out, - found_naive, - found_tz, - utc, - ) - if isinstance(val, _Timestamp): - iresult[i] = val.tz_localize(None).as_unit("ns").value - else: - iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) - check_dts_bounds(&dts) - result_timezone[i] = val.tzinfo - continue - elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) - continue - elif is_datetime64_object(val): - iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) - continue - elif ( - (is_integer_object(val) or is_float_object(val)) - and (val != val or val == NPY_NAT) - ): - iresult[i] = NPY_NAT - continue - else: - val = str(val) - - # exact matching - if exact: - found = format_regex.match(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError(f"time data '{val}' does not match " - f"format '{fmt}' (match)") - if len(val) != found.end(): - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError(f"unconverted data remains: {val[found.end():]}") - - # search - else: - found = format_regex.search(val) - if not found: - if is_coerce: + try: + if isinstance(val, str): + if val in nat_strings: iresult[i] = NPY_NAT continue - raise ValueError(f"time data {repr(val)} does not match format " - f"{repr(fmt)} (search)") - - iso_year = -1 - year = 1900 - month = day = 1 - hour = minute = second = ns = us = 0 - tz = None - # Default to -1 to signify that values not known; not critical to have, - # though - iso_week = week_of_year = -1 - week_of_year_start = -1 - # weekday and julian defaulted to -1 so as to signal need to calculate - # values - weekday = julian = -1 - found_dict = found.groupdict() - for group_key in found_dict.iterkeys(): - # Directives not explicitly handled below: - # c, x, X - # handled by making out of other directives - # U, W - # worthless without day of the week - parse_code = _parse_code_table[group_key] - - if parse_code == 0: - year = int(found_dict["y"]) - # Open Group specification for strptime() states that a %y - # value in the range of [00, 68] is in the century 2000, while - # [69,99] is in the century 1900 - if year <= 68: - year += 2000 + elif checknull_with_nat_and_na(val): + iresult[i] = NPY_NAT + continue + elif PyDateTime_Check(val): + if val.tzinfo is not None: + found_tz = True else: - year += 1900 - elif parse_code == 1: - year = int(found_dict["Y"]) - elif parse_code == 2: - month = int(found_dict["m"]) - # elif group_key == 'B': - elif parse_code == 3: - month = locale_time.f_month.index(found_dict["B"].lower()) - # elif group_key == 'b': - elif parse_code == 4: - month = locale_time.a_month.index(found_dict["b"].lower()) - # elif group_key == 'd': - elif parse_code == 5: - day = int(found_dict["d"]) - # elif group_key == 'H': - elif parse_code == 6: - hour = int(found_dict["H"]) - elif parse_code == 7: - hour = int(found_dict["I"]) - ampm = found_dict.get("p", "").lower() - # If there was no AM/PM indicator, we'll treat this like AM - if ampm in ("", locale_time.am_pm[0]): - # We're in AM so the hour is correct unless we're - # looking at 12 midnight. - # 12 midnight == 12 AM == hour 0 - if hour == 12: - hour = 0 - elif ampm == locale_time.am_pm[1]: - # We're in PM so we need to add 12 to the hour unless - # we're looking at 12 noon. - # 12 noon == 12 PM == hour 12 - if hour != 12: - hour += 12 - elif parse_code == 8: - minute = int(found_dict["M"]) - elif parse_code == 9: - second = int(found_dict["S"]) - elif parse_code == 10: - s = found_dict["f"] - # Pad to always return nanoseconds - s += "0" * (9 - len(s)) - us = long(s) - ns = us % 1000 - us = us // 1000 - elif parse_code == 11: - weekday = locale_time.f_weekday.index(found_dict["A"].lower()) - elif parse_code == 12: - weekday = locale_time.a_weekday.index(found_dict["a"].lower()) - elif parse_code == 13: - weekday = int(found_dict["w"]) - if weekday == 0: - weekday = 6 + found_naive = True + tz_out = convert_timezone( + val.tzinfo, + tz_out, + found_naive, + found_tz, + utc, + ) + if isinstance(val, _Timestamp): + iresult[i] = val.tz_localize(None).as_unit("ns").value else: + iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) + check_dts_bounds(&dts) + result_timezone[i] = val.tzinfo + continue + elif PyDate_Check(val): + iresult[i] = pydate_to_dt64(val, &dts) + check_dts_bounds(&dts) + continue + elif is_datetime64_object(val): + iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) + continue + elif ( + (is_integer_object(val) or is_float_object(val)) + and (val != val or val == NPY_NAT) + ): + iresult[i] = NPY_NAT + continue + else: + val = str(val) + + # exact matching + if exact: + found = format_regex.match(val) + if not found: + raise ValueError(f"time data '{val}' does not match " + f"format '{fmt}' (match)") + if len(val) != found.end(): + raise ValueError(f"unconverted data remains: {val[found.end():]}") + + # search + else: + found = format_regex.search(val) + if not found: + raise ValueError(f"time data {repr(val)} does not match format " + f"{repr(fmt)} (search)") + + iso_year = -1 + year = 1900 + month = day = 1 + hour = minute = second = ns = us = 0 + tz = None + # Default to -1 to signify that values not known; not critical to have, + # though + iso_week = week_of_year = -1 + week_of_year_start = -1 + # weekday and julian defaulted to -1 so as to signal need to calculate + # values + weekday = julian = -1 + found_dict = found.groupdict() + for group_key in found_dict.iterkeys(): + # Directives not explicitly handled below: + # c, x, X + # handled by making out of other directives + # U, W + # worthless without day of the week + parse_code = _parse_code_table[group_key] + + if parse_code == 0: + year = int(found_dict["y"]) + # Open Group specification for strptime() states that a %y + # value in the range of [00, 68] is in the century 2000, while + # [69,99] is in the century 1900 + if year <= 68: + year += 2000 + else: + year += 1900 + elif parse_code == 1: + year = int(found_dict["Y"]) + elif parse_code == 2: + month = int(found_dict["m"]) + # elif group_key == 'B': + elif parse_code == 3: + month = locale_time.f_month.index(found_dict["B"].lower()) + # elif group_key == 'b': + elif parse_code == 4: + month = locale_time.a_month.index(found_dict["b"].lower()) + # elif group_key == 'd': + elif parse_code == 5: + day = int(found_dict["d"]) + # elif group_key == 'H': + elif parse_code == 6: + hour = int(found_dict["H"]) + elif parse_code == 7: + hour = int(found_dict["I"]) + ampm = found_dict.get("p", "").lower() + # If there was no AM/PM indicator, we'll treat this like AM + if ampm in ("", locale_time.am_pm[0]): + # We're in AM so the hour is correct unless we're + # looking at 12 midnight. + # 12 midnight == 12 AM == hour 0 + if hour == 12: + hour = 0 + elif ampm == locale_time.am_pm[1]: + # We're in PM so we need to add 12 to the hour unless + # we're looking at 12 noon. + # 12 noon == 12 PM == hour 12 + if hour != 12: + hour += 12 + elif parse_code == 8: + minute = int(found_dict["M"]) + elif parse_code == 9: + second = int(found_dict["S"]) + elif parse_code == 10: + s = found_dict["f"] + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us // 1000 + elif parse_code == 11: + weekday = locale_time.f_weekday.index(found_dict["A"].lower()) + elif parse_code == 12: + weekday = locale_time.a_weekday.index(found_dict["a"].lower()) + elif parse_code == 13: + weekday = int(found_dict["w"]) + if weekday == 0: + weekday = 6 + else: + weekday -= 1 + elif parse_code == 14: + julian = int(found_dict["j"]) + elif parse_code == 15 or parse_code == 16: + week_of_year = int(found_dict[group_key]) + if group_key == "U": + # U starts week on Sunday. + week_of_year_start = 6 + else: + # W starts week on Monday. + week_of_year_start = 0 + elif parse_code == 17: + tz = pytz.timezone(found_dict["Z"]) + elif parse_code == 19: + tz = parse_timezone_directive(found_dict["z"]) + elif parse_code == 20: + iso_year = int(found_dict["G"]) + elif parse_code == 21: + iso_week = int(found_dict["V"]) + elif parse_code == 22: + weekday = int(found_dict["u"]) weekday -= 1 - elif parse_code == 14: - julian = int(found_dict["j"]) - elif parse_code == 15 or parse_code == 16: - week_of_year = int(found_dict[group_key]) - if group_key == "U": - # U starts week on Sunday. - week_of_year_start = 6 + + # don't assume default values for ISO week/year + if iso_year != -1: + if iso_week == -1 or weekday == -1: + raise ValueError("ISO year directive '%G' must be used with " + "the ISO week directive '%V' and a weekday " + "directive '%A', '%a', '%w', or '%u'.") + if julian != -1: + raise ValueError("Day of the year directive '%j' is not " + "compatible with ISO year directive '%G'. " + "Use '%Y' instead.") + elif year != -1 and week_of_year == -1 and iso_week != -1: + if weekday == -1: + raise ValueError("ISO week directive '%V' must be used with " + "the ISO year directive '%G' and a weekday " + "directive '%A', '%a', '%w', or '%u'.") else: - # W starts week on Monday. - week_of_year_start = 0 - elif parse_code == 17: - tz = pytz.timezone(found_dict["Z"]) - elif parse_code == 19: - tz = parse_timezone_directive(found_dict["z"]) - elif parse_code == 20: - iso_year = int(found_dict["G"]) - elif parse_code == 21: - iso_week = int(found_dict["V"]) - elif parse_code == 22: - weekday = int(found_dict["u"]) - weekday -= 1 - - # If we know the wk of the year and what day of that wk, we can figure - # out the Julian day of the year. - if julian == -1 and weekday != -1: - if week_of_year != -1: - week_starts_Mon = week_of_year_start == 0 - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) - elif iso_year != -1 and iso_week != -1: - year, julian = _calc_julian_from_V(iso_year, iso_week, - weekday + 1) - # Cannot pre-calculate date() since can change in Julian - # calculation and thus could have different value for the day of the wk - # calculation. - try: + raise ValueError("ISO week directive '%V' is incompatible with " + "the year directive '%Y'. Use the ISO year " + "'%G' instead.") + + # If we know the wk of the year and what day of that wk, we can figure + # out the Julian day of the year. + if julian == -1 and weekday != -1: + if week_of_year != -1: + week_starts_Mon = week_of_year_start == 0 + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + elif iso_year != -1 and iso_week != -1: + year, julian = _calc_julian_from_V(iso_year, iso_week, + weekday + 1) + # Cannot pre-calculate date() since can change in Julian + # calculation and thus could have different value for the day of the wk + # calculation. if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. @@ -387,33 +399,29 @@ def array_strptime( year = datetime_result.year month = datetime_result.month day = datetime_result.day - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - if weekday == -1: - weekday = date(year, month, day).weekday() - - dts.year = year - dts.month = month - dts.day = day - dts.hour = hour - dts.min = minute - dts.sec = second - dts.us = us - dts.ps = ns * 1000 - - iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - try: + if weekday == -1: + weekday = date(year, month, day).weekday() + + dts.year = year + dts.month = month + dts.day = day + dts.hour = hour + dts.min = minute + dts.sec = second + dts.us = us + dts.ps = ns * 1000 + + iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) check_dts_bounds(&dts) - except ValueError: + result_timezone[i] = tz + except (ValueError, OutOfBoundsDatetime): if is_coerce: iresult[i] = NPY_NAT continue - raise - - result_timezone[i] = tz + elif is_raise: + raise + # Return input unaltered if errors='ignore' + return values, [] return result, result_timezone.base diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 885eb6286393b..a97a866a8406e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -483,31 +483,9 @@ def _array_strptime_with_fallback( """ Call array_strptime, with fallback behavior depending on 'errors'. """ - try: - result, timezones = array_strptime( - arg, fmt, exact=exact, errors=errors, utc=utc - ) - except OutOfBoundsDatetime: - if errors == "raise": - raise - if errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg - except ValueError: - if errors == "raise": - raise - if errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg - else: - if any(tz is not None for tz in timezones): - return _return_parsed_timezone_results(result, timezones, utc, name) + result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + if any(tz is not None for tz in timezones): + return _return_parsed_timezone_results(result, timezones, utc, name) return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index db5a968458044..b087ca90b18e6 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1109,9 +1109,17 @@ def test_datetime_invalid_datatype(self, arg): with pytest.raises(TypeError, match=msg): to_datetime(arg) + @pytest.mark.parametrize("errors", ["coerce", "raise", "ignore"]) + def test_invalid_format_raises(self, errors): + # https://github.com/pandas-dev/pandas/issues/50255 + with pytest.raises( + ValueError, match="':' is a bad directive in format 'H%:M%:S%" + ): + to_datetime(["00:00:00"], format="H%:M%:S%", errors=errors) + @pytest.mark.parametrize("value", ["a", "00:01:99"]) @pytest.mark.parametrize( - "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + "format,warning", [(None, UserWarning), ("%H:%M:%S", None)] ) def test_datetime_invalid_scalar(self, value, format, warning): # GH24763 @@ -1124,7 +1132,8 @@ def test_datetime_invalid_scalar(self, value, format, warning): assert res is NaT msg = ( - "is a bad directive in format|" + "does not match format|" + "unconverted data remains:|" "second must be in 0..59|" f"Given date string {value} not likely a datetime" ) @@ -1134,7 +1143,7 @@ def test_datetime_invalid_scalar(self, value, format, warning): @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) @pytest.mark.parametrize( - "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + "format,warning", [(None, UserWarning), ("%H:%M:%S", None)] ) def test_datetime_outofbounds_scalar(self, value, format, warning): # GH24763 @@ -1147,7 +1156,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): assert res is NaT if format is not None: - msg = "is a bad directive in format|Out of bounds .* present at position 0" + msg = "does not match format|Out of bounds .* present at position 0" with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: @@ -1159,7 +1168,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) @pytest.mark.parametrize( - "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + "format,warning", [(None, UserWarning), ("%H:%M:%S", None)] ) def test_datetime_invalid_index(self, values, format, warning): # GH24763 @@ -1172,7 +1181,8 @@ def test_datetime_invalid_index(self, values, format, warning): tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) msg = ( - "is a bad directive in format|" + "does not match format|" + "unconverted data remains:|" f"Given date string {values[0]} not likely a datetime|" "second must be in 0..59" ) @@ -1312,6 +1322,27 @@ def test_to_datetime_coerce(self): ) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "string_arg, format", + [("March 1, 2018", "%B %d, %Y"), ("2018-03-01", "%Y-%m-%d")], + ) + @pytest.mark.parametrize( + "outofbounds", + [ + datetime(9999, 1, 1), + date(9999, 1, 1), + np.datetime64("9999-01-01"), + "January 1, 9999", + "9999-01-01", + ], + ) + def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): + # https://github.com/pandas-dev/pandas/issues/50255 + ts_strings = [string_arg, outofbounds] + result = to_datetime(ts_strings, errors="coerce", format=format) + expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "errors, expected", [ From 2f9094f8e1fc5138a2be13cdb5b5e7ba58f21e40 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 16 Dec 2022 17:40:40 +0000 Subject: [PATCH 2/4] avoid large outer try-except --- pandas/_libs/tslibs/strptime.pyx | 388 ++++++++++++------------- pandas/tests/tools/test_to_datetime.py | 8 +- 2 files changed, 196 insertions(+), 200 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fd5b262d7e3b9..dfb9b57de3b2a 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -194,198 +194,180 @@ def array_strptime( if val in nat_strings: iresult[i] = NPY_NAT continue - elif checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - continue - elif PyDateTime_Check(val): - if val.tzinfo is not None: - found_tz = True - else: - found_naive = True - tz_out = convert_timezone( - val.tzinfo, - tz_out, - found_naive, - found_tz, - utc, - ) - if isinstance(val, _Timestamp): - iresult[i] = val.tz_localize(None).as_unit("ns").value - else: - iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) - check_dts_bounds(&dts) - result_timezone[i] = val.tzinfo - continue - elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) - continue - elif is_datetime64_object(val): + elif is_raise: + raise + return values, [] + continue + elif is_datetime64_object(val): + try: iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) - continue - elif ( - (is_integer_object(val) or is_float_object(val)) - and (val != val or val == NPY_NAT) - ): - iresult[i] = NPY_NAT - continue - else: - val = str(val) - - # exact matching - if exact: - found = format_regex.match(val) - if not found: + except OutOfBoundsDatetime: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise + return values, [] + continue + elif ( + (is_integer_object(val) or is_float_object(val)) + and (val != val or val == NPY_NAT) + ): + iresult[i] = NPY_NAT + continue + else: + val = str(val) + + # exact matching + if exact: + found = format_regex.match(val) + if not found: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: raise ValueError(f"time data '{val}' does not match " f"format '{fmt}' (match)") - if len(val) != found.end(): + return values, [] + if len(val) != found.end(): + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: raise ValueError(f"unconverted data remains: {val[found.end():]}") + return values, [] - # search - else: - found = format_regex.search(val) - if not found: + # search + else: + found = format_regex.search(val) + if not found: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: raise ValueError(f"time data {repr(val)} does not match format " f"{repr(fmt)} (search)") - - iso_year = -1 - year = 1900 - month = day = 1 - hour = minute = second = ns = us = 0 - tz = None - # Default to -1 to signify that values not known; not critical to have, - # though - iso_week = week_of_year = -1 - week_of_year_start = -1 - # weekday and julian defaulted to -1 so as to signal need to calculate - # values - weekday = julian = -1 - found_dict = found.groupdict() - for group_key in found_dict.iterkeys(): - # Directives not explicitly handled below: - # c, x, X - # handled by making out of other directives - # U, W - # worthless without day of the week - parse_code = _parse_code_table[group_key] - - if parse_code == 0: - year = int(found_dict["y"]) - # Open Group specification for strptime() states that a %y - # value in the range of [00, 68] is in the century 2000, while - # [69,99] is in the century 1900 - if year <= 68: - year += 2000 - else: - year += 1900 - elif parse_code == 1: - year = int(found_dict["Y"]) - elif parse_code == 2: - month = int(found_dict["m"]) - # elif group_key == 'B': - elif parse_code == 3: - month = locale_time.f_month.index(found_dict["B"].lower()) - # elif group_key == 'b': - elif parse_code == 4: - month = locale_time.a_month.index(found_dict["b"].lower()) - # elif group_key == 'd': - elif parse_code == 5: - day = int(found_dict["d"]) - # elif group_key == 'H': - elif parse_code == 6: - hour = int(found_dict["H"]) - elif parse_code == 7: - hour = int(found_dict["I"]) - ampm = found_dict.get("p", "").lower() - # If there was no AM/PM indicator, we'll treat this like AM - if ampm in ("", locale_time.am_pm[0]): - # We're in AM so the hour is correct unless we're - # looking at 12 midnight. - # 12 midnight == 12 AM == hour 0 - if hour == 12: - hour = 0 - elif ampm == locale_time.am_pm[1]: - # We're in PM so we need to add 12 to the hour unless - # we're looking at 12 noon. - # 12 noon == 12 PM == hour 12 - if hour != 12: - hour += 12 - elif parse_code == 8: - minute = int(found_dict["M"]) - elif parse_code == 9: - second = int(found_dict["S"]) - elif parse_code == 10: - s = found_dict["f"] - # Pad to always return nanoseconds - s += "0" * (9 - len(s)) - us = long(s) - ns = us % 1000 - us = us // 1000 - elif parse_code == 11: - weekday = locale_time.f_weekday.index(found_dict["A"].lower()) - elif parse_code == 12: - weekday = locale_time.a_weekday.index(found_dict["a"].lower()) - elif parse_code == 13: - weekday = int(found_dict["w"]) - if weekday == 0: - weekday = 6 - else: - weekday -= 1 - elif parse_code == 14: - julian = int(found_dict["j"]) - elif parse_code == 15 or parse_code == 16: - week_of_year = int(found_dict[group_key]) - if group_key == "U": - # U starts week on Sunday. - week_of_year_start = 6 - else: - # W starts week on Monday. - week_of_year_start = 0 - elif parse_code == 17: - tz = pytz.timezone(found_dict["Z"]) - elif parse_code == 19: - tz = parse_timezone_directive(found_dict["z"]) - elif parse_code == 20: - iso_year = int(found_dict["G"]) - elif parse_code == 21: - iso_week = int(found_dict["V"]) - elif parse_code == 22: - weekday = int(found_dict["u"]) + return values, [] + + iso_year = -1 + year = 1900 + month = day = 1 + hour = minute = second = ns = us = 0 + tz = None + # Default to -1 to signify that values not known; not critical to have, + # though + iso_week = week_of_year = -1 + week_of_year_start = -1 + # weekday and julian defaulted to -1 so as to signal need to calculate + # values + weekday = julian = -1 + found_dict = found.groupdict() + for group_key in found_dict.iterkeys(): + # Directives not explicitly handled below: + # c, x, X + # handled by making out of other directives + # U, W + # worthless without day of the week + parse_code = _parse_code_table[group_key] + + if parse_code == 0: + year = int(found_dict["y"]) + # Open Group specification for strptime() states that a %y + # value in the range of [00, 68] is in the century 2000, while + # [69,99] is in the century 1900 + if year <= 68: + year += 2000 + else: + year += 1900 + elif parse_code == 1: + year = int(found_dict["Y"]) + elif parse_code == 2: + month = int(found_dict["m"]) + # elif group_key == 'B': + elif parse_code == 3: + month = locale_time.f_month.index(found_dict["B"].lower()) + # elif group_key == 'b': + elif parse_code == 4: + month = locale_time.a_month.index(found_dict["b"].lower()) + # elif group_key == 'd': + elif parse_code == 5: + day = int(found_dict["d"]) + # elif group_key == 'H': + elif parse_code == 6: + hour = int(found_dict["H"]) + elif parse_code == 7: + hour = int(found_dict["I"]) + ampm = found_dict.get("p", "").lower() + # If there was no AM/PM indicator, we'll treat this like AM + if ampm in ("", locale_time.am_pm[0]): + # We're in AM so the hour is correct unless we're + # looking at 12 midnight. + # 12 midnight == 12 AM == hour 0 + if hour == 12: + hour = 0 + elif ampm == locale_time.am_pm[1]: + # We're in PM so we need to add 12 to the hour unless + # we're looking at 12 noon. + # 12 noon == 12 PM == hour 12 + if hour != 12: + hour += 12 + elif parse_code == 8: + minute = int(found_dict["M"]) + elif parse_code == 9: + second = int(found_dict["S"]) + elif parse_code == 10: + s = found_dict["f"] + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us // 1000 + elif parse_code == 11: + weekday = locale_time.f_weekday.index(found_dict["A"].lower()) + elif parse_code == 12: + weekday = locale_time.a_weekday.index(found_dict["a"].lower()) + elif parse_code == 13: + weekday = int(found_dict["w"]) + if weekday == 0: + weekday = 6 + else: weekday -= 1 - - # don't assume default values for ISO week/year - if iso_year != -1: - if iso_week == -1 or weekday == -1: - raise ValueError("ISO year directive '%G' must be used with " - "the ISO week directive '%V' and a weekday " - "directive '%A', '%a', '%w', or '%u'.") - if julian != -1: - raise ValueError("Day of the year directive '%j' is not " - "compatible with ISO year directive '%G'. " - "Use '%Y' instead.") - elif year != -1 and week_of_year == -1 and iso_week != -1: - if weekday == -1: - raise ValueError("ISO week directive '%V' must be used with " - "the ISO year directive '%G' and a weekday " - "directive '%A', '%a', '%w', or '%u'.") + elif parse_code == 14: + julian = int(found_dict["j"]) + elif parse_code == 15 or parse_code == 16: + week_of_year = int(found_dict[group_key]) + if group_key == "U": + # U starts week on Sunday. + week_of_year_start = 6 else: - raise ValueError("ISO week directive '%V' is incompatible with " - "the year directive '%Y'. Use the ISO year " - "'%G' instead.") - - # If we know the wk of the year and what day of that wk, we can figure - # out the Julian day of the year. - if julian == -1 and weekday != -1: - if week_of_year != -1: - week_starts_Mon = week_of_year_start == 0 - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) - elif iso_year != -1 and iso_week != -1: - year, julian = _calc_julian_from_V(iso_year, iso_week, - weekday + 1) - # Cannot pre-calculate date() since can change in Julian - # calculation and thus could have different value for the day of the wk - # calculation. + # W starts week on Monday. + week_of_year_start = 0 + elif parse_code == 17: + tz = pytz.timezone(found_dict["Z"]) + elif parse_code == 19: + tz = parse_timezone_directive(found_dict["z"]) + elif parse_code == 20: + iso_year = int(found_dict["G"]) + elif parse_code == 21: + iso_week = int(found_dict["V"]) + elif parse_code == 22: + weekday = int(found_dict["u"]) + weekday -= 1 + + # If we know the wk of the year and what day of that wk, we can figure + # out the Julian day of the year. + if julian == -1 and weekday != -1: + if week_of_year != -1: + week_starts_Mon = week_of_year_start == 0 + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + elif iso_year != -1 and iso_week != -1: + year, julian = _calc_julian_from_V(iso_year, iso_week, + weekday + 1) + # Cannot pre-calculate date() since can change in Julian + # calculation and thus could have different value for the day of the wk + # calculation. + try: if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. @@ -399,30 +381,38 @@ def array_strptime( year = datetime_result.year month = datetime_result.month day = datetime_result.day - if weekday == -1: - weekday = date(year, month, day).weekday() - - dts.year = year - dts.month = month - dts.day = day - dts.hour = hour - dts.min = minute - dts.sec = second - dts.us = us - dts.ps = ns * 1000 - - iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + except ValueError: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise + return values, [] + if weekday == -1: + weekday = date(year, month, day).weekday() + + dts.year = year + dts.month = month + dts.day = day + dts.hour = hour + dts.min = minute + dts.sec = second + dts.us = us + dts.ps = ns * 1000 + + iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + try: check_dts_bounds(&dts) - result_timezone[i] = tz - except (ValueError, OutOfBoundsDatetime): + except ValueError: if is_coerce: iresult[i] = NPY_NAT continue elif is_raise: raise - # Return input unaltered if errors='ignore' return values, [] + result_timezone[i] = tz + return result, result_timezone.base diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b087ca90b18e6..bba9d0a4bc9c2 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -744,8 +744,14 @@ def test_to_datetime_iso_week_year_format(self, s, _format, dt): ], ], ) +<<<<<<< HEAD def test_error_iso_week_year(self, msg, s, _format): # See GH#16607, #50308 +======= + @pytest.mark.parametrize("errors", ["raise", "coerce", "ignore"]) + def test_error_iso_week_year(self, msg, s, _format, errors): + # See GH#16607 +>>>>>>> 22f4dde6e4 (avoid large outer try-except) # This test checks for errors thrown when giving the wrong format # However, as discussed on PR#25541, overriding the locale # causes a different error to be thrown due to the format being @@ -757,7 +763,7 @@ def test_error_iso_week_year(self, msg, s, _format): "UTF-8", ): with pytest.raises(ValueError, match=msg): - to_datetime(s, format=_format) + to_datetime(s, format=_format, errors=errors) @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_to_datetime_dtarr(self, tz): From 95222e6fb267426a60089e8f7e50303fce15b3da Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 19 Dec 2022 18:48:50 +0000 Subject: [PATCH 3/4] fixup --- pandas/_libs/tslibs/strptime.pyx | 373 ++++++++++++------------- pandas/tests/tools/test_to_datetime.py | 5 - 2 files changed, 182 insertions(+), 196 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index dfb9b57de3b2a..73e9176d3a6d2 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -191,183 +191,181 @@ def array_strptime( val = values[i] try: if isinstance(val, str): - if val in nat_strings: + if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT continue - elif is_raise: - raise - return values, [] - continue - elif is_datetime64_object(val): - try: + elif checknull_with_nat_and_na(val): + iresult[i] = NPY_NAT + continue + elif PyDateTime_Check(val): + if val.tzinfo is not None: + found_tz = True + else: + found_naive = True + tz_out = convert_timezone( + val.tzinfo, + tz_out, + found_naive, + found_tz, + utc, + ) + if isinstance(val, _Timestamp): + iresult[i] = val.tz_localize(None).as_unit("ns").value + else: + iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) + check_dts_bounds(&dts) + result_timezone[i] = val.tzinfo + continue + elif PyDate_Check(val): + iresult[i] = pydate_to_dt64(val, &dts) + check_dts_bounds(&dts) + continue + elif is_datetime64_object(val): iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise - return values, [] - continue - elif ( - (is_integer_object(val) or is_float_object(val)) - and (val != val or val == NPY_NAT) - ): - iresult[i] = NPY_NAT - continue - else: - val = str(val) - - # exact matching - if exact: - found = format_regex.match(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: + continue + elif ( + (is_integer_object(val) or is_float_object(val)) + and (val != val or val == NPY_NAT) + ): + iresult[i] = NPY_NAT + continue + else: + val = str(val) + + # exact matching + if exact: + found = format_regex.match(val) + if not found: raise ValueError(f"time data '{val}' does not match " f"format '{fmt}' (match)") - return values, [] - if len(val) != found.end(): - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: + if len(val) != found.end(): raise ValueError(f"unconverted data remains: {val[found.end():]}") - return values, [] - # search - else: - found = format_regex.search(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: + # search + else: + found = format_regex.search(val) + if not found: raise ValueError(f"time data {repr(val)} does not match format " f"{repr(fmt)} (search)") - return values, [] - - iso_year = -1 - year = 1900 - month = day = 1 - hour = minute = second = ns = us = 0 - tz = None - # Default to -1 to signify that values not known; not critical to have, - # though - iso_week = week_of_year = -1 - week_of_year_start = -1 - # weekday and julian defaulted to -1 so as to signal need to calculate - # values - weekday = julian = -1 - found_dict = found.groupdict() - for group_key in found_dict.iterkeys(): - # Directives not explicitly handled below: - # c, x, X - # handled by making out of other directives - # U, W - # worthless without day of the week - parse_code = _parse_code_table[group_key] - - if parse_code == 0: - year = int(found_dict["y"]) - # Open Group specification for strptime() states that a %y - # value in the range of [00, 68] is in the century 2000, while - # [69,99] is in the century 1900 - if year <= 68: - year += 2000 - else: - year += 1900 - elif parse_code == 1: - year = int(found_dict["Y"]) - elif parse_code == 2: - month = int(found_dict["m"]) - # elif group_key == 'B': - elif parse_code == 3: - month = locale_time.f_month.index(found_dict["B"].lower()) - # elif group_key == 'b': - elif parse_code == 4: - month = locale_time.a_month.index(found_dict["b"].lower()) - # elif group_key == 'd': - elif parse_code == 5: - day = int(found_dict["d"]) - # elif group_key == 'H': - elif parse_code == 6: - hour = int(found_dict["H"]) - elif parse_code == 7: - hour = int(found_dict["I"]) - ampm = found_dict.get("p", "").lower() - # If there was no AM/PM indicator, we'll treat this like AM - if ampm in ("", locale_time.am_pm[0]): - # We're in AM so the hour is correct unless we're - # looking at 12 midnight. - # 12 midnight == 12 AM == hour 0 - if hour == 12: - hour = 0 - elif ampm == locale_time.am_pm[1]: - # We're in PM so we need to add 12 to the hour unless - # we're looking at 12 noon. - # 12 noon == 12 PM == hour 12 - if hour != 12: - hour += 12 - elif parse_code == 8: - minute = int(found_dict["M"]) - elif parse_code == 9: - second = int(found_dict["S"]) - elif parse_code == 10: - s = found_dict["f"] - # Pad to always return nanoseconds - s += "0" * (9 - len(s)) - us = long(s) - ns = us % 1000 - us = us // 1000 - elif parse_code == 11: - weekday = locale_time.f_weekday.index(found_dict["A"].lower()) - elif parse_code == 12: - weekday = locale_time.a_weekday.index(found_dict["a"].lower()) - elif parse_code == 13: - weekday = int(found_dict["w"]) - if weekday == 0: - weekday = 6 - else: + + iso_year = -1 + year = 1900 + month = day = 1 + hour = minute = second = ns = us = 0 + tz = None + # Default to -1 to signify that values not known; not critical to have, + # though + iso_week = week_of_year = -1 + week_of_year_start = -1 + # weekday and julian defaulted to -1 so as to signal need to calculate + # values + weekday = julian = -1 + found_dict = found.groupdict() + for group_key in found_dict.iterkeys(): + # Directives not explicitly handled below: + # c, x, X + # handled by making out of other directives + # U, W + # worthless without day of the week + parse_code = _parse_code_table[group_key] + + if parse_code == 0: + year = int(found_dict["y"]) + # Open Group specification for strptime() states that a %y + # value in the range of [00, 68] is in the century 2000, while + # [69,99] is in the century 1900 + if year <= 68: + year += 2000 + else: + year += 1900 + elif parse_code == 1: + year = int(found_dict["Y"]) + elif parse_code == 2: + month = int(found_dict["m"]) + # elif group_key == 'B': + elif parse_code == 3: + month = locale_time.f_month.index(found_dict["B"].lower()) + # elif group_key == 'b': + elif parse_code == 4: + month = locale_time.a_month.index(found_dict["b"].lower()) + # elif group_key == 'd': + elif parse_code == 5: + day = int(found_dict["d"]) + # elif group_key == 'H': + elif parse_code == 6: + hour = int(found_dict["H"]) + elif parse_code == 7: + hour = int(found_dict["I"]) + ampm = found_dict.get("p", "").lower() + # If there was no AM/PM indicator, we'll treat this like AM + if ampm in ("", locale_time.am_pm[0]): + # We're in AM so the hour is correct unless we're + # looking at 12 midnight. + # 12 midnight == 12 AM == hour 0 + if hour == 12: + hour = 0 + elif ampm == locale_time.am_pm[1]: + # We're in PM so we need to add 12 to the hour unless + # we're looking at 12 noon. + # 12 noon == 12 PM == hour 12 + if hour != 12: + hour += 12 + elif parse_code == 8: + minute = int(found_dict["M"]) + elif parse_code == 9: + second = int(found_dict["S"]) + elif parse_code == 10: + s = found_dict["f"] + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us // 1000 + elif parse_code == 11: + weekday = locale_time.f_weekday.index(found_dict["A"].lower()) + elif parse_code == 12: + weekday = locale_time.a_weekday.index(found_dict["a"].lower()) + elif parse_code == 13: + weekday = int(found_dict["w"]) + if weekday == 0: + weekday = 6 + else: + weekday -= 1 + elif parse_code == 14: + julian = int(found_dict["j"]) + elif parse_code == 15 or parse_code == 16: + week_of_year = int(found_dict[group_key]) + if group_key == "U": + # U starts week on Sunday. + week_of_year_start = 6 + else: + # W starts week on Monday. + week_of_year_start = 0 + elif parse_code == 17: + tz = pytz.timezone(found_dict["Z"]) + elif parse_code == 19: + tz = parse_timezone_directive(found_dict["z"]) + elif parse_code == 20: + iso_year = int(found_dict["G"]) + elif parse_code == 21: + iso_week = int(found_dict["V"]) + elif parse_code == 22: + weekday = int(found_dict["u"]) weekday -= 1 - elif parse_code == 14: - julian = int(found_dict["j"]) - elif parse_code == 15 or parse_code == 16: - week_of_year = int(found_dict[group_key]) - if group_key == "U": - # U starts week on Sunday. - week_of_year_start = 6 - else: - # W starts week on Monday. - week_of_year_start = 0 - elif parse_code == 17: - tz = pytz.timezone(found_dict["Z"]) - elif parse_code == 19: - tz = parse_timezone_directive(found_dict["z"]) - elif parse_code == 20: - iso_year = int(found_dict["G"]) - elif parse_code == 21: - iso_week = int(found_dict["V"]) - elif parse_code == 22: - weekday = int(found_dict["u"]) - weekday -= 1 - - # If we know the wk of the year and what day of that wk, we can figure - # out the Julian day of the year. - if julian == -1 and weekday != -1: - if week_of_year != -1: - week_starts_Mon = week_of_year_start == 0 - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) - elif iso_year != -1 and iso_week != -1: - year, julian = _calc_julian_from_V(iso_year, iso_week, - weekday + 1) - # Cannot pre-calculate date() since can change in Julian - # calculation and thus could have different value for the day of the wk - # calculation. - try: + + # If we know the wk of the year and what day of that wk, we can figure + # out the Julian day of the year. + if julian == -1 and weekday != -1: + if week_of_year != -1: + week_starts_Mon = week_of_year_start == 0 + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + elif iso_year != -1 and iso_week != -1: + year, julian = _calc_julian_from_V(iso_year, iso_week, + weekday + 1) + # Cannot pre-calculate date() since can change in Julian + # calculation and thus could have different value for the day of the wk + # calculation. if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. @@ -381,29 +379,24 @@ def array_strptime( year = datetime_result.year month = datetime_result.month day = datetime_result.day - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise - return values, [] - if weekday == -1: - weekday = date(year, month, day).weekday() - - dts.year = year - dts.month = month - dts.day = day - dts.hour = hour - dts.min = minute - dts.sec = second - dts.us = us - dts.ps = ns * 1000 - - iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - try: + if weekday == -1: + weekday = date(year, month, day).weekday() + + dts.year = year + dts.month = month + dts.day = day + dts.hour = hour + dts.min = minute + dts.sec = second + dts.us = us + dts.ps = ns * 1000 + + iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) check_dts_bounds(&dts) - except ValueError: + + result_timezone[i] = tz + + except (ValueError, OutOfBoundsDatetime): if is_coerce: iresult[i] = NPY_NAT continue @@ -411,8 +404,6 @@ def array_strptime( raise return values, [] - result_timezone[i] = tz - return result, result_timezone.base diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index bba9d0a4bc9c2..c78ecb90eda0c 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -744,14 +744,9 @@ def test_to_datetime_iso_week_year_format(self, s, _format, dt): ], ], ) -<<<<<<< HEAD - def test_error_iso_week_year(self, msg, s, _format): - # See GH#16607, #50308 -======= @pytest.mark.parametrize("errors", ["raise", "coerce", "ignore"]) def test_error_iso_week_year(self, msg, s, _format, errors): # See GH#16607 ->>>>>>> 22f4dde6e4 (avoid large outer try-except) # This test checks for errors thrown when giving the wrong format # However, as discussed on PR#25541, overriding the locale # causes a different error to be thrown due to the format being From 919a4330db54484a48efe8319cd55dbf271fe3e1 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 19 Dec 2022 19:03:39 +0000 Subject: [PATCH 4/4] postmerge fixup --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c78ecb90eda0c..83e40f5f1d98b 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -746,7 +746,7 @@ def test_to_datetime_iso_week_year_format(self, s, _format, dt): ) @pytest.mark.parametrize("errors", ["raise", "coerce", "ignore"]) def test_error_iso_week_year(self, msg, s, _format, errors): - # See GH#16607 + # See GH#16607, GH#50308 # This test checks for errors thrown when giving the wrong format # However, as discussed on PR#25541, overriding the locale # causes a different error to be thrown due to the format being