Skip to content

ERR non-ISO formats don't show position of error #50366

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ Other enhancements
- Added :meth:`Index.infer_objects` analogous to :meth:`Series.infer_objects` (:issue:`50034`)
- Added ``copy`` parameter to :meth:`Series.infer_objects` and :meth:`DataFrame.infer_objects`, passing ``False`` will avoid making copies for series or columns that are already non-object or where no better dtype can be inferred (:issue:`50096`)
- :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`)
- Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`)
-

.. ---------------------------------------------------------------------------
Expand Down
6 changes: 4 additions & 2 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -515,9 +515,10 @@ cpdef array_to_datetime(
iresult[i] = NPY_NAT
continue
elif is_raise:
match_msg = "match" if exact else "search"
raise ValueError(
f"time data \"{val}\" at position {i} doesn't "
f"match format \"{format}\""
f"match format \"{format}\" ({match_msg})"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ending this with "search" seems weird (and i dont see it in any test cases)

Copy link
Member Author

@MarcoGorelli MarcoGorelli Dec 22, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's true that it wasn't tested, but it's been like this since pandas 0.24:

In [1]: import pandas

In [2]: pandas.__version__
Out[2]: '0.24.0'

In [3]: pandas.to_datetime(['2020-01-f00'], format='%Y-%d-%m', exact=False)
---------------------------------------------------------------------------

ValueError: time data '2020-01-f00' does not match format '%Y-%d-%m' (search)

Maybe we can just remove it - people typically know whether they've passed exact, and it doesn't really add much

Especially as it was never there for (more common) ISO formats:

In [4]: pandas.to_datetime(['2020-01-f00'], format='%Y-%m-%d', exact=False)
---------------------------------------------------------------------------
ValueError: time data 2020-01-f00 doesn't match format specified

Let's just unify them to

time data '2020-01-f00' does not match format '%Y-%d-%m'

)
return values, tz_out
# these must be ns unit by-definition
Expand Down Expand Up @@ -564,9 +565,10 @@ cpdef array_to_datetime(
iresult[i] = NPY_NAT
continue
elif is_raise:
match_msg = "match" if exact else "search"
raise ValueError(
f"time data \"{val}\" at position {i} doesn't "
f"match format \"{format}\""
f"match format \"{format}\" ({match_msg})"
)
return values, tz_out

Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def parse_datetime_string(
datetime dt

if not _does_string_look_like_datetime(date_string):
raise ValueError(f"Given date string {date_string} not likely a datetime")
raise ValueError(f'Given date string "{date_string}" not likely a datetime')

if does_string_look_like_time(date_string):
# use current datetime as default, not pass _DEFAULT_DATETIME
Expand Down Expand Up @@ -297,7 +297,7 @@ def parse_datetime_string(
except TypeError:
# following may be raised from dateutil
# TypeError: 'NoneType' object is not iterable
raise ValueError(f"Given date string {date_string} not likely a datetime")
raise ValueError(f'Given date string "{date_string}" not likely a datetime')

return dt

Expand Down Expand Up @@ -373,7 +373,7 @@ cdef parse_datetime_string_with_reso(
int out_tzoffset

if not _does_string_look_like_datetime(date_string):
raise ValueError(f"Given date string {date_string} not likely a datetime")
raise ValueError(f'Given date string "{date_string}" not likely a datetime')

parsed, reso = _parse_delimited_date(date_string, dayfirst)
if parsed is not None:
Expand Down
25 changes: 19 additions & 6 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -236,17 +236,22 @@ def array_strptime(
if exact:
found = format_regex.match(val)
if not found:
raise ValueError(f"time data '{val}' does not match "
f"format '{fmt}' (match)")
raise ValueError(f"time data \"{val}\" at position {i} doesn't "
f"match format \"{fmt}\" (match)")
if len(val) != found.end():
raise ValueError(f"unconverted data remains: {val[found.end():]}")
raise ValueError(
f"unconverted data remains at position {i}: "
f'"{val[found.end():]}"'
)

# search
else:
found = format_regex.search(val)
if not found:
raise ValueError(f"time data {repr(val)} does not match format "
f"{repr(fmt)} (search)")
raise ValueError(
f"time data \"{val}\" at position {i} doesn't match "
f"format \"{fmt}\" (search)"
)

iso_year = -1
year = 1900
Expand Down Expand Up @@ -396,7 +401,15 @@ def array_strptime(

result_timezone[i] = tz

except (ValueError, OutOfBoundsDatetime):
except OutOfBoundsDatetime as ex:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than breaking off this branch can we not keep the existing one and just specialize for the OOB exception within?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you're right, that's better - thanks!

ex.args = (f"{str(ex)} present at position {i}",)
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise
return values, []
except ValueError:
if is_coerce:
iresult[i] = NPY_NAT
continue
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1717,7 +1717,10 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
# GH46210
with pytest.raises(
ValueError,
match=r"^time data '31/05/2000' does not match format '%m/%d/%Y' \(match\)$",
match=(
r'^time data "31/05/2000" at position 1 doesn\'t match format "%m/%d/%Y" '
r"\(match\)$"
),
):
pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/scalar/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def test_invalid_arguments(self):
with pytest.raises(ValueError, match=msg):
Period(month=1)

msg = "Given date string -2000 not likely a datetime"
msg = '^Given date string "-2000" not likely a datetime$'
with pytest.raises(ValueError, match=msg):
Period("-2000", "A")
msg = "day is out of range for month"
Expand Down
65 changes: 44 additions & 21 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,10 @@ def test_to_datetime_parse_timezone_malformed(self, offset):
fmt = "%Y-%m-%d %H:%M:%S %z"
date = "2010-01-01 12:00:00 " + offset

msg = "does not match format|unconverted data remains"
msg = (
r'^(time data ".*" at position 0 doesn\'t match format ".*" \(match\)|'
r'unconverted data remains at position 0: ".*")$'
)
with pytest.raises(ValueError, match=msg):
to_datetime([date], format=fmt)

Expand Down Expand Up @@ -1093,7 +1096,10 @@ def test_datetime_bool_arrays_mixed(self, cache):
to_datetime([False, datetime.today()], cache=cache)
with pytest.raises(
ValueError,
match=r"^time data 'True' does not match format '%Y%m%d' \(match\)$",
match=(
r'^time data "True" at position 1 doesn\'t match format "%Y%m%d" '
r"\(match\)$"
),
):
to_datetime(["20130101", True], cache=cache)
tm.assert_index_equal(
Expand Down Expand Up @@ -1133,10 +1139,10 @@ def test_datetime_invalid_scalar(self, value, format, warning):
assert res is NaT

msg = (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick can you use msg = "|".join(...) instead of a single string

"does not match format|"
"unconverted data remains:|"
"second must be in 0..59|"
f"Given date string {value} not likely a datetime"
r'^(time data "a" at position 0 doesn\'t match format "%H:%M:%S" \(match\)|'
r'Given date string "a" not likely a datetime present at position 0|'
r'unconverted data remains at position 0: "9"|'
r"second must be in 0..59: 00:01:99 present at position 0)$"
)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(warning, match="Could not infer format"):
Expand All @@ -1157,7 +1163,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning):
assert res is NaT

if format is not None:
msg = "does not match format|Out of bounds .* present at position 0"
msg = r'^time data ".*" at position 0 doesn\'t match format ".*" \(match\)$'
with pytest.raises(ValueError, match=msg):
to_datetime(value, errors="raise", format=format)
else:
Expand All @@ -1182,10 +1188,10 @@ def test_datetime_invalid_index(self, values, format, warning):
tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values)))

msg = (
"does not match format|"
"unconverted data remains:|"
f"Given date string {values[0]} not likely a datetime|"
"second must be in 0..59"
r'^(Given date string "a" not likely a datetime present at position 0|'
r'time data "a" at position 0 doesn\'t match format "%H:%M:%S" \(match\)|'
r'unconverted data remains at position 0: "9"|'
r"second must be in 0..59: 00:01:99 present at position 0)$"
)
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(warning, match="Could not infer format"):
Expand Down Expand Up @@ -1805,8 +1811,8 @@ def test_dataframe_coerce(self, cache):
df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]})

msg = (
"cannot assemble the datetimes: time data .+ does not "
r"match format '%Y%m%d' \(match\)"
r'^cannot assemble the datetimes: time data ".+" at position 1 doesn\'t '
r'match format "%Y%m%d" \(match\)$'
)
with pytest.raises(ValueError, match=msg):
to_datetime(df2, cache=cache)
Expand Down Expand Up @@ -1882,7 +1888,10 @@ def test_dataframe_mixed(self, cache):
def test_dataframe_float(self, cache):
# float
df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
msg = "cannot assemble the datetimes: unconverted data remains: 1"
msg = (
r"^cannot assemble the datetimes: unconverted data remains at position "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NBD but is the "r" necessary?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't be, it's just a habit to always use that in regular expressions

r'0: "1"$'
)
with pytest.raises(ValueError, match=msg):
to_datetime(df, cache=cache)

Expand Down Expand Up @@ -2072,7 +2081,9 @@ def test_to_datetime_on_datetime64_series(self, cache):
def test_to_datetime_with_space_in_series(self, cache):
# GH 6428
ser = Series(["10/18/2006", "10/18/2008", " "])
msg = r"^time data ' ' does not match format '%m/%d/%Y' \(match\)$"
msg = (
r'^time data " " at position 2 doesn\'t match format "%m/%d/%Y" \(match\)$'
)
with pytest.raises(ValueError, match=msg):
to_datetime(ser, errors="raise", cache=cache)
result_coerce = to_datetime(ser, errors="coerce", cache=cache)
Expand Down Expand Up @@ -2342,7 +2353,10 @@ def test_dayfirst_warnings_invalid_input(self):

with pytest.raises(
ValueError,
match=r"time data '03/30/2011' does not match format '%d/%m/%Y' \(match\)$",
match=(
r'^time data "03/30/2011" at position 1 doesn\'t match format '
r'"%d/%m/%Y" \(match\)$'
),
):
to_datetime(arr, dayfirst=True)

Expand Down Expand Up @@ -2410,7 +2424,11 @@ def test_to_datetime_infer_datetime_format_consistent_format(
def test_to_datetime_inconsistent_format(self, cache):
data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
ser = Series(np.array(data))
with pytest.raises(ValueError, match="does not match format"):
msg = (
r'^time data "01-02-2011 00:00:00" at position 1 doesn\'t match format '
r'"%m/%d/%Y %H:%M:%S" \(match\)$'
)
with pytest.raises(ValueError, match=msg):
to_datetime(ser, cache=cache)

def test_to_datetime_consistent_format(self, cache):
Expand Down Expand Up @@ -2923,17 +2941,22 @@ def test_incorrect_value_exception(self):
to_datetime(["today", "yesterday"])

@pytest.mark.parametrize(
"format, warning", [(None, UserWarning), ("%Y-%m-%d %H:%M:%S", None)]
"format, warning",
[
(None, UserWarning),
("%Y-%m-%d %H:%M:%S", None),
("%Y-%d-%m %H:%M:%S", None),
],
)
def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning):
# see gh-23830
msg = (
"Out of bounds nanosecond timestamp: 2417-10-27 00:00:00 "
"present at position 0"
r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00 "
r"present at position 0$"
)
with pytest.raises(OutOfBoundsDatetime, match=msg):
with tm.assert_produces_warning(warning, match="Could not infer format"):
to_datetime("2417-10-27 00:00:00", format=format)
to_datetime("2417-10-10 00:00:00", format=format)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

slightly changing the input here so we can parametrise of it with both "%Y-%m-%d %H:%M:%S" and "%Y-%d-%m %H:%M:%S" (to check ISO vs non-ISO)


@pytest.mark.parametrize(
"arg, origin, expected_str",
Expand Down