diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4425565fbcc8a..9c5656a1e68fd 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -794,6 +794,7 @@ Datetimelike - Bug in :func:`to_datetime` was raising ``ValueError`` when parsing empty string and non-ISO8601 format was passed. Now, empty strings will be parsed as :class:`NaT`, for compatibility with how is done for ISO8601 formats (:issue:`50251`) - Bug in :class:`Timestamp` was showing ``UserWarning``, which was not actionable by users, when parsing non-ISO8601 delimited date strings (:issue:`50232`) - Bug in :func:`to_datetime` was showing misleading ``ValueError`` when parsing dates with format containing ISO week directive and ISO weekday directive (:issue:`50308`) +- Bug in :func:`to_datetime` was not raising ``ValueError`` when invalid format was passed and ``errors`` was ``'ignore'`` or ``'coerce'`` (:issue:`50266`) - Timedelta diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 53330e806215c..73e9176d3a6d2 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -41,6 +41,7 @@ from pandas._libs.tslibs.np_datetime cimport ( pydate_to_dt64, pydatetime_to_dt64, ) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.util cimport ( is_datetime64_object, @@ -188,192 +189,183 @@ def array_strptime( for i in range(n): val = values[i] - if isinstance(val, str): - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - continue - elif checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - continue - elif PyDateTime_Check(val): - if val.tzinfo is not None: - found_tz = True - else: - found_naive = True - tz_out = convert_timezone( - val.tzinfo, - tz_out, - found_naive, - found_tz, - utc, - ) - if isinstance(val, _Timestamp): - iresult[i] = val.tz_localize(None).as_unit("ns").value - else: - iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) - check_dts_bounds(&dts) - result_timezone[i] = val.tzinfo - continue - elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) - continue - elif is_datetime64_object(val): - iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) - continue - elif ( - (is_integer_object(val) or is_float_object(val)) - and (val != val or val == NPY_NAT) - ): - iresult[i] = NPY_NAT - continue - else: - val = str(val) - - # exact matching - if exact: - found = format_regex.match(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError(f"time data '{val}' does not match " - f"format '{fmt}' (match)") - if len(val) != found.end(): - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError(f"unconverted data remains: {val[found.end():]}") - - # search - else: - found = format_regex.search(val) - if not found: - if is_coerce: + try: + if isinstance(val, str): + if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT continue - raise ValueError(f"time data {repr(val)} does not match format " - f"{repr(fmt)} (search)") - - iso_year = -1 - year = 1900 - month = day = 1 - hour = minute = second = ns = us = 0 - tz = None - # Default to -1 to signify that values not known; not critical to have, - # though - iso_week = week_of_year = -1 - week_of_year_start = -1 - # weekday and julian defaulted to -1 so as to signal need to calculate - # values - weekday = julian = -1 - found_dict = found.groupdict() - for group_key in found_dict.iterkeys(): - # Directives not explicitly handled below: - # c, x, X - # handled by making out of other directives - # U, W - # worthless without day of the week - parse_code = _parse_code_table[group_key] - - if parse_code == 0: - year = int(found_dict["y"]) - # Open Group specification for strptime() states that a %y - # value in the range of [00, 68] is in the century 2000, while - # [69,99] is in the century 1900 - if year <= 68: - year += 2000 + elif checknull_with_nat_and_na(val): + iresult[i] = NPY_NAT + continue + elif PyDateTime_Check(val): + if val.tzinfo is not None: + found_tz = True else: - year += 1900 - elif parse_code == 1: - year = int(found_dict["Y"]) - elif parse_code == 2: - month = int(found_dict["m"]) - # elif group_key == 'B': - elif parse_code == 3: - month = locale_time.f_month.index(found_dict["B"].lower()) - # elif group_key == 'b': - elif parse_code == 4: - month = locale_time.a_month.index(found_dict["b"].lower()) - # elif group_key == 'd': - elif parse_code == 5: - day = int(found_dict["d"]) - # elif group_key == 'H': - elif parse_code == 6: - hour = int(found_dict["H"]) - elif parse_code == 7: - hour = int(found_dict["I"]) - ampm = found_dict.get("p", "").lower() - # If there was no AM/PM indicator, we'll treat this like AM - if ampm in ("", locale_time.am_pm[0]): - # We're in AM so the hour is correct unless we're - # looking at 12 midnight. - # 12 midnight == 12 AM == hour 0 - if hour == 12: - hour = 0 - elif ampm == locale_time.am_pm[1]: - # We're in PM so we need to add 12 to the hour unless - # we're looking at 12 noon. - # 12 noon == 12 PM == hour 12 - if hour != 12: - hour += 12 - elif parse_code == 8: - minute = int(found_dict["M"]) - elif parse_code == 9: - second = int(found_dict["S"]) - elif parse_code == 10: - s = found_dict["f"] - # Pad to always return nanoseconds - s += "0" * (9 - len(s)) - us = long(s) - ns = us % 1000 - us = us // 1000 - elif parse_code == 11: - weekday = locale_time.f_weekday.index(found_dict["A"].lower()) - elif parse_code == 12: - weekday = locale_time.a_weekday.index(found_dict["a"].lower()) - elif parse_code == 13: - weekday = int(found_dict["w"]) - if weekday == 0: - weekday = 6 + found_naive = True + tz_out = convert_timezone( + val.tzinfo, + tz_out, + found_naive, + found_tz, + utc, + ) + if isinstance(val, _Timestamp): + iresult[i] = val.tz_localize(None).as_unit("ns").value else: + iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) + check_dts_bounds(&dts) + result_timezone[i] = val.tzinfo + continue + elif PyDate_Check(val): + iresult[i] = pydate_to_dt64(val, &dts) + check_dts_bounds(&dts) + continue + elif is_datetime64_object(val): + iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) + continue + elif ( + (is_integer_object(val) or is_float_object(val)) + and (val != val or val == NPY_NAT) + ): + iresult[i] = NPY_NAT + continue + else: + val = str(val) + + # exact matching + if exact: + found = format_regex.match(val) + if not found: + raise ValueError(f"time data '{val}' does not match " + f"format '{fmt}' (match)") + if len(val) != found.end(): + raise ValueError(f"unconverted data remains: {val[found.end():]}") + + # search + else: + found = format_regex.search(val) + if not found: + raise ValueError(f"time data {repr(val)} does not match format " + f"{repr(fmt)} (search)") + + iso_year = -1 + year = 1900 + month = day = 1 + hour = minute = second = ns = us = 0 + tz = None + # Default to -1 to signify that values not known; not critical to have, + # though + iso_week = week_of_year = -1 + week_of_year_start = -1 + # weekday and julian defaulted to -1 so as to signal need to calculate + # values + weekday = julian = -1 + found_dict = found.groupdict() + for group_key in found_dict.iterkeys(): + # Directives not explicitly handled below: + # c, x, X + # handled by making out of other directives + # U, W + # worthless without day of the week + parse_code = _parse_code_table[group_key] + + if parse_code == 0: + year = int(found_dict["y"]) + # Open Group specification for strptime() states that a %y + # value in the range of [00, 68] is in the century 2000, while + # [69,99] is in the century 1900 + if year <= 68: + year += 2000 + else: + year += 1900 + elif parse_code == 1: + year = int(found_dict["Y"]) + elif parse_code == 2: + month = int(found_dict["m"]) + # elif group_key == 'B': + elif parse_code == 3: + month = locale_time.f_month.index(found_dict["B"].lower()) + # elif group_key == 'b': + elif parse_code == 4: + month = locale_time.a_month.index(found_dict["b"].lower()) + # elif group_key == 'd': + elif parse_code == 5: + day = int(found_dict["d"]) + # elif group_key == 'H': + elif parse_code == 6: + hour = int(found_dict["H"]) + elif parse_code == 7: + hour = int(found_dict["I"]) + ampm = found_dict.get("p", "").lower() + # If there was no AM/PM indicator, we'll treat this like AM + if ampm in ("", locale_time.am_pm[0]): + # We're in AM so the hour is correct unless we're + # looking at 12 midnight. + # 12 midnight == 12 AM == hour 0 + if hour == 12: + hour = 0 + elif ampm == locale_time.am_pm[1]: + # We're in PM so we need to add 12 to the hour unless + # we're looking at 12 noon. + # 12 noon == 12 PM == hour 12 + if hour != 12: + hour += 12 + elif parse_code == 8: + minute = int(found_dict["M"]) + elif parse_code == 9: + second = int(found_dict["S"]) + elif parse_code == 10: + s = found_dict["f"] + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us // 1000 + elif parse_code == 11: + weekday = locale_time.f_weekday.index(found_dict["A"].lower()) + elif parse_code == 12: + weekday = locale_time.a_weekday.index(found_dict["a"].lower()) + elif parse_code == 13: + weekday = int(found_dict["w"]) + if weekday == 0: + weekday = 6 + else: + weekday -= 1 + elif parse_code == 14: + julian = int(found_dict["j"]) + elif parse_code == 15 or parse_code == 16: + week_of_year = int(found_dict[group_key]) + if group_key == "U": + # U starts week on Sunday. + week_of_year_start = 6 + else: + # W starts week on Monday. + week_of_year_start = 0 + elif parse_code == 17: + tz = pytz.timezone(found_dict["Z"]) + elif parse_code == 19: + tz = parse_timezone_directive(found_dict["z"]) + elif parse_code == 20: + iso_year = int(found_dict["G"]) + elif parse_code == 21: + iso_week = int(found_dict["V"]) + elif parse_code == 22: + weekday = int(found_dict["u"]) weekday -= 1 - elif parse_code == 14: - julian = int(found_dict["j"]) - elif parse_code == 15 or parse_code == 16: - week_of_year = int(found_dict[group_key]) - if group_key == "U": - # U starts week on Sunday. - week_of_year_start = 6 - else: - # W starts week on Monday. - week_of_year_start = 0 - elif parse_code == 17: - tz = pytz.timezone(found_dict["Z"]) - elif parse_code == 19: - tz = parse_timezone_directive(found_dict["z"]) - elif parse_code == 20: - iso_year = int(found_dict["G"]) - elif parse_code == 21: - iso_week = int(found_dict["V"]) - elif parse_code == 22: - weekday = int(found_dict["u"]) - weekday -= 1 - - # If we know the wk of the year and what day of that wk, we can figure - # out the Julian day of the year. - if julian == -1 and weekday != -1: - if week_of_year != -1: - week_starts_Mon = week_of_year_start == 0 - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) - elif iso_year != -1 and iso_week != -1: - year, julian = _calc_julian_from_V(iso_year, iso_week, - weekday + 1) - # Cannot pre-calculate date() since can change in Julian - # calculation and thus could have different value for the day of the wk - # calculation. - try: + + # If we know the wk of the year and what day of that wk, we can figure + # out the Julian day of the year. + if julian == -1 and weekday != -1: + if week_of_year != -1: + week_starts_Mon = week_of_year_start == 0 + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + elif iso_year != -1 and iso_week != -1: + year, julian = _calc_julian_from_V(iso_year, iso_week, + weekday + 1) + # Cannot pre-calculate date() since can change in Julian + # calculation and thus could have different value for the day of the wk + # calculation. if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. @@ -387,33 +379,30 @@ def array_strptime( year = datetime_result.year month = datetime_result.month day = datetime_result.day - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - if weekday == -1: - weekday = date(year, month, day).weekday() - - dts.year = year - dts.month = month - dts.day = day - dts.hour = hour - dts.min = minute - dts.sec = second - dts.us = us - dts.ps = ns * 1000 - - iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - try: + if weekday == -1: + weekday = date(year, month, day).weekday() + + dts.year = year + dts.month = month + dts.day = day + dts.hour = hour + dts.min = minute + dts.sec = second + dts.us = us + dts.ps = ns * 1000 + + iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) check_dts_bounds(&dts) - except ValueError: + + result_timezone[i] = tz + + except (ValueError, OutOfBoundsDatetime): if is_coerce: iresult[i] = NPY_NAT continue - raise - - result_timezone[i] = tz + elif is_raise: + raise + return values, [] return result, result_timezone.base diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 885eb6286393b..a97a866a8406e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -483,31 +483,9 @@ def _array_strptime_with_fallback( """ Call array_strptime, with fallback behavior depending on 'errors'. """ - try: - result, timezones = array_strptime( - arg, fmt, exact=exact, errors=errors, utc=utc - ) - except OutOfBoundsDatetime: - if errors == "raise": - raise - if errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg - except ValueError: - if errors == "raise": - raise - if errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg - else: - if any(tz is not None for tz in timezones): - return _return_parsed_timezone_results(result, timezones, utc, name) + result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + if any(tz is not None for tz in timezones): + return _return_parsed_timezone_results(result, timezones, utc, name) return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index db5a968458044..83e40f5f1d98b 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -744,8 +744,9 @@ def test_to_datetime_iso_week_year_format(self, s, _format, dt): ], ], ) - def test_error_iso_week_year(self, msg, s, _format): - # See GH#16607, #50308 + @pytest.mark.parametrize("errors", ["raise", "coerce", "ignore"]) + def test_error_iso_week_year(self, msg, s, _format, errors): + # See GH#16607, GH#50308 # This test checks for errors thrown when giving the wrong format # However, as discussed on PR#25541, overriding the locale # causes a different error to be thrown due to the format being @@ -757,7 +758,7 @@ def test_error_iso_week_year(self, msg, s, _format): "UTF-8", ): with pytest.raises(ValueError, match=msg): - to_datetime(s, format=_format) + to_datetime(s, format=_format, errors=errors) @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_to_datetime_dtarr(self, tz): @@ -1109,9 +1110,17 @@ def test_datetime_invalid_datatype(self, arg): with pytest.raises(TypeError, match=msg): to_datetime(arg) + @pytest.mark.parametrize("errors", ["coerce", "raise", "ignore"]) + def test_invalid_format_raises(self, errors): + # https://github.com/pandas-dev/pandas/issues/50255 + with pytest.raises( + ValueError, match="':' is a bad directive in format 'H%:M%:S%" + ): + to_datetime(["00:00:00"], format="H%:M%:S%", errors=errors) + @pytest.mark.parametrize("value", ["a", "00:01:99"]) @pytest.mark.parametrize( - "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + "format,warning", [(None, UserWarning), ("%H:%M:%S", None)] ) def test_datetime_invalid_scalar(self, value, format, warning): # GH24763 @@ -1124,7 +1133,8 @@ def test_datetime_invalid_scalar(self, value, format, warning): assert res is NaT msg = ( - "is a bad directive in format|" + "does not match format|" + "unconverted data remains:|" "second must be in 0..59|" f"Given date string {value} not likely a datetime" ) @@ -1134,7 +1144,7 @@ def test_datetime_invalid_scalar(self, value, format, warning): @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) @pytest.mark.parametrize( - "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + "format,warning", [(None, UserWarning), ("%H:%M:%S", None)] ) def test_datetime_outofbounds_scalar(self, value, format, warning): # GH24763 @@ -1147,7 +1157,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): assert res is NaT if format is not None: - msg = "is a bad directive in format|Out of bounds .* present at position 0" + msg = "does not match format|Out of bounds .* present at position 0" with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: @@ -1159,7 +1169,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) @pytest.mark.parametrize( - "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + "format,warning", [(None, UserWarning), ("%H:%M:%S", None)] ) def test_datetime_invalid_index(self, values, format, warning): # GH24763 @@ -1172,7 +1182,8 @@ def test_datetime_invalid_index(self, values, format, warning): tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) msg = ( - "is a bad directive in format|" + "does not match format|" + "unconverted data remains:|" f"Given date string {values[0]} not likely a datetime|" "second must be in 0..59" ) @@ -1312,6 +1323,27 @@ def test_to_datetime_coerce(self): ) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "string_arg, format", + [("March 1, 2018", "%B %d, %Y"), ("2018-03-01", "%Y-%m-%d")], + ) + @pytest.mark.parametrize( + "outofbounds", + [ + datetime(9999, 1, 1), + date(9999, 1, 1), + np.datetime64("9999-01-01"), + "January 1, 9999", + "9999-01-01", + ], + ) + def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): + # https://github.com/pandas-dev/pandas/issues/50255 + ts_strings = [string_arg, outofbounds] + result = to_datetime(ts_strings, errors="coerce", format=format) + expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "errors, expected", [