-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: out-of-bounds pydatetime with errors='coerce' and non-ISO format coerces all elements to NaT #50265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: out-of-bounds pydatetime with errors='coerce' and non-ISO format coerces all elements to NaT #50265
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -744,8 +744,9 @@ def test_to_datetime_iso_week_year_format(self, s, _format, dt): | |
], | ||
], | ||
) | ||
def test_error_iso_week_year(self, msg, s, _format): | ||
# See GH#16607, #50308 | ||
@pytest.mark.parametrize("errors", ["raise", "coerce", "ignore"]) | ||
def test_error_iso_week_year(self, msg, s, _format, errors): | ||
# See GH#16607, GH#50308 | ||
# This test checks for errors thrown when giving the wrong format | ||
# However, as discussed on PR#25541, overriding the locale | ||
# causes a different error to be thrown due to the format being | ||
|
@@ -757,7 +758,7 @@ def test_error_iso_week_year(self, msg, s, _format): | |
"UTF-8", | ||
): | ||
with pytest.raises(ValueError, match=msg): | ||
to_datetime(s, format=_format) | ||
to_datetime(s, format=_format, errors=errors) | ||
|
||
@pytest.mark.parametrize("tz", [None, "US/Central"]) | ||
def test_to_datetime_dtarr(self, tz): | ||
|
@@ -1109,9 +1110,17 @@ def test_datetime_invalid_datatype(self, arg): | |
with pytest.raises(TypeError, match=msg): | ||
to_datetime(arg) | ||
|
||
@pytest.mark.parametrize("errors", ["coerce", "raise", "ignore"]) | ||
def test_invalid_format_raises(self, errors): | ||
# https://github.com/pandas-dev/pandas/issues/50255 | ||
with pytest.raises( | ||
ValueError, match="':' is a bad directive in format 'H%:M%:S%" | ||
): | ||
to_datetime(["00:00:00"], format="H%:M%:S%", errors=errors) | ||
|
||
@pytest.mark.parametrize("value", ["a", "00:01:99"]) | ||
@pytest.mark.parametrize( | ||
"format,warning", [(None, UserWarning), ("H%:M%:S%", None)] | ||
"format,warning", [(None, UserWarning), ("%H:%M:%S", None)] | ||
) | ||
def test_datetime_invalid_scalar(self, value, format, warning): | ||
Comment on lines
-1114
to
1125
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the test claims to be testing invalid scalars, but in reality it was erroring before the parsing even started because So, I've changed the format to reflect what it looks like this test was meant to be testing, and have made a separate test above to check that invalid formats raise |
||
# GH24763 | ||
|
@@ -1124,7 +1133,8 @@ def test_datetime_invalid_scalar(self, value, format, warning): | |
assert res is NaT | ||
|
||
msg = ( | ||
"is a bad directive in format|" | ||
"does not match format|" | ||
"unconverted data remains:|" | ||
"second must be in 0..59|" | ||
f"Given date string {value} not likely a datetime" | ||
) | ||
|
@@ -1134,7 +1144,7 @@ def test_datetime_invalid_scalar(self, value, format, warning): | |
|
||
@pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) | ||
@pytest.mark.parametrize( | ||
"format,warning", [(None, UserWarning), ("H%:M%:S%", None)] | ||
"format,warning", [(None, UserWarning), ("%H:%M:%S", None)] | ||
) | ||
def test_datetime_outofbounds_scalar(self, value, format, warning): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as above (this errors before even reaching the out-of-bounds scalar...) |
||
# GH24763 | ||
|
@@ -1147,7 +1157,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): | |
assert res is NaT | ||
|
||
if format is not None: | ||
msg = "is a bad directive in format|Out of bounds .* present at position 0" | ||
msg = "does not match format|Out of bounds .* present at position 0" | ||
with pytest.raises(ValueError, match=msg): | ||
to_datetime(value, errors="raise", format=format) | ||
else: | ||
|
@@ -1159,7 +1169,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): | |
|
||
@pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) | ||
@pytest.mark.parametrize( | ||
"format,warning", [(None, UserWarning), ("H%:M%:S%", None)] | ||
"format,warning", [(None, UserWarning), ("%H:%M:%S", None)] | ||
) | ||
MarcoGorelli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
def test_datetime_invalid_index(self, values, format, warning): | ||
# GH24763 | ||
|
@@ -1172,7 +1182,8 @@ def test_datetime_invalid_index(self, values, format, warning): | |
tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) | ||
|
||
msg = ( | ||
"is a bad directive in format|" | ||
"does not match format|" | ||
"unconverted data remains:|" | ||
f"Given date string {values[0]} not likely a datetime|" | ||
"second must be in 0..59" | ||
) | ||
|
@@ -1312,6 +1323,27 @@ def test_to_datetime_coerce(self): | |
) | ||
tm.assert_index_equal(result, expected) | ||
|
||
@pytest.mark.parametrize( | ||
"string_arg, format", | ||
[("March 1, 2018", "%B %d, %Y"), ("2018-03-01", "%Y-%m-%d")], | ||
) | ||
@pytest.mark.parametrize( | ||
"outofbounds", | ||
[ | ||
datetime(9999, 1, 1), | ||
date(9999, 1, 1), | ||
np.datetime64("9999-01-01"), | ||
"January 1, 9999", | ||
"9999-01-01", | ||
], | ||
) | ||
def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): | ||
# https://github.com/pandas-dev/pandas/issues/50255 | ||
ts_strings = [string_arg, outofbounds] | ||
result = to_datetime(ts_strings, errors="coerce", format=format) | ||
expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
@pytest.mark.parametrize( | ||
"errors, expected", | ||
[ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is the code inside this try/except just indenting whats currently there or are there other changes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's basically just that, but I also removed some
as that already happens in the outer try-except
If you hide whitespace it'll be easier to see - you can also go to https://github.com/pandas-dev/pandas/pull/50265/files?diff=unified&w=1 (notice the
?diff=unified&w=1
at the end) to see itThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Generally I'm not a big fan of try/excepts with a huge block of code inside, but I guess since the failures modes here should be made consistent with the broader
to_datetime(errors=)
keyword I suppose this is fine.Question, sometime we are issues some
ValueError
s internally in this function that will be caught by this try except e.g.Is this necessarily a "parsing" error that should be influenced by the errors keyword?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can rewrite it such that each
check_dts_bounds
is within its own try-except - maybe that's better actually, thanksI'll get back to you on that ValueError
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, refactored
That's a really good point about the "ISO year directive" error, thanks, good catch! So, I think that one should always raise. At least, that seems to be the current intention, there's no
if is_coerce
check there like there is for some other errors, such aspandas/pandas/_libs/tslibs/strptime.pyx
Lines 378 to 382 in a5cbd1e
So, I've parametrised
test_error_iso_week_year
too overerrors
to check that this one actually raisesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not actually opposed to the large try/except block if we expect ValueError and OutofBounds Errors to be affected by
errors
. If other excepts in this functions can be factored out (like the iso year directive one) and others then it would help ensure errors is consistently handledThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK thanks - have factored out the iso year directive check in #50309, so I've gone back to having an outer try-except block (so again, "hide-whitespace" will come in handy when reviewing)