Skip to content

BUG: Fix for pandas.to_datetime reports incorrect index when failing #59594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ Datetimelike
- Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
- Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
- Bug in :meth:`to_datetime` reporting an incorrect index in the error message when parsing fails (:issue:`58298`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

Timedelta
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ cpdef array_to_datetime(
raise TypeError(f"{type(val)} is not convertible to datetime")

except (TypeError, OverflowError, ValueError) as ex:
ex.args = (f"{ex}, at position {i}",)
ex.args = (f"{ex}",)
if is_coerce:
iresult[i] = NPY_NAT
continue
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ def array_strptime(

except ValueError as ex:
ex.args = (
f"{str(ex)}, at position {i}. You might want to try:\n"
f"{str(ex)}. You might want to try:\n"
" - passing `format` if your strings have a consistent format;\n"
" - passing `format='ISO8601'` if your strings are "
"all ISO8601 but not necessarily in exactly the same format;\n"
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def f(dtype):
f("float64")

# 10822
msg = "^Unknown datetime string format, unable to parse: aa, at position 0$"
msg = "^Unknown datetime string format, unable to parse: aa$"
with pytest.raises(ValueError, match=msg):
f("M8[ns]")

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,8 +507,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
with pytest.raises(
ValueError,
match=(
r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", '
r"at position 1. You might want to try:"
r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y". '
r"You might want to try:"
),
):
pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_infer_with_date_and_datetime(self):
def test_unparsable_strings_with_dt64_dtype(self):
# pre-2.0 these would be silently ignored and come back with object dtype
vals = ["aa"]
msg = "^Unknown datetime string format, unable to parse: aa, at position 0$"
msg = "^Unknown datetime string format, unable to parse: aa$"
with pytest.raises(ValueError, match=msg):
Series(vals, dtype="datetime64[ns]")

Expand Down
117 changes: 54 additions & 63 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache):
with pytest.raises(
ValueError,
match=(
'unconverted data remains when parsing with format "%Y%m%d": ".0", '
"at position 0"
'unconverted data remains when parsing with format "%Y%m%d": ".0". '
),
):
# https://github.com/pandas-dev/pandas/issues/50051
Expand Down Expand Up @@ -514,10 +513,9 @@ def test_to_datetime_parse_timezone_malformed(self, offset):

msg = "|".join(
[
r'^time data ".*" doesn\'t match format ".*", at position 0. '
r'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$",
r'^unconverted data remains when parsing with format ".*": ".*". '
f"{PARSING_ERR_MSG}$",
r'^unconverted data remains when parsing with format ".*": ".*", '
f"at position 0. {PARSING_ERR_MSG}$",
]
)
with pytest.raises(ValueError, match=msg):
Expand All @@ -539,7 +537,7 @@ def test_to_datetime_overflow(self):
# TODO: Timestamp raises ValueError("could not convert string to Timestamp")
# can we make these more consistent?
arg = "08335394550"
msg = 'Parsing "08335394550" to datetime overflows, at position 0'
msg = 'Parsing "08335394550" to datetime overflows'
with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime(arg)

Expand Down Expand Up @@ -1309,8 +1307,8 @@ def test_datetime_bool_arrays_mixed(self, cache):
with pytest.raises(
ValueError,
match=(
r'^time data "True" doesn\'t match format "%Y%m%d", '
f"at position 1. {PARSING_ERR_MSG}$"
r'^time data "True" doesn\'t match format "%Y%m%d". '
f"{PARSING_ERR_MSG}$"
),
):
to_datetime(["20130101", True], cache=cache)
Expand Down Expand Up @@ -1345,12 +1343,12 @@ def test_datetime_invalid_scalar(self, value, format):

msg = "|".join(
[
r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. '
r'^time data "a" doesn\'t match format "%H:%M:%S". '
f"{PARSING_ERR_MSG}$",
r'^Given date string "a" not likely a datetime, at position 0$',
r'^unconverted data remains when parsing with format "%H:%M:%S": "9", '
f"at position 0. {PARSING_ERR_MSG}$",
r"^second must be in 0..59: 00:01:99, at position 0$",
r'^Given date string "a" not likely a datetime$',
r'^unconverted data remains when parsing with format "%H:%M:%S": "9". '
f"{PARSING_ERR_MSG}$",
r"^second must be in 0..59: 00:01:99$",
]
)
with pytest.raises(ValueError, match=msg):
Expand All @@ -1368,7 +1366,7 @@ def test_datetime_outofbounds_scalar(self, value, format):
assert res is NaT

if format is not None:
msg = r'^time data ".*" doesn\'t match format ".*", at position 0.'
msg = r'^time data ".*" doesn\'t match format ".*"'
with pytest.raises(ValueError, match=msg):
to_datetime(value, errors="raise", format=format)
else:
Expand Down Expand Up @@ -1397,12 +1395,12 @@ def test_datetime_invalid_index(self, values, format):

msg = "|".join(
[
r'^Given date string "a" not likely a datetime, at position 0$',
r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. '
r'^Given date string "a" not likely a datetime$',
r'^time data "a" doesn\'t match format "%H:%M:%S". '
f"{PARSING_ERR_MSG}$",
r'^unconverted data remains when parsing with format "%H:%M:%S": "9". '
f"{PARSING_ERR_MSG}$",
r'^unconverted data remains when parsing with format "%H:%M:%S": "9", '
f"at position 0. {PARSING_ERR_MSG}$",
r"^second must be in 0..59: 00:01:99, at position 0$",
r"^second must be in 0..59: 00:01:99$",
]
)
with pytest.raises(ValueError, match=msg):
Expand Down Expand Up @@ -1582,8 +1580,7 @@ def test_to_datetime_malformed_raise(self):
ts_strings = ["200622-12-31", "111111-24-11"]
msg = (
'Parsed string "200622-12-31" gives an invalid tzoffset, which must '
r"be between -timedelta\(hours=24\) and timedelta\(hours=24\), "
"at position 0"
r"be between -timedelta\(hours=24\) and timedelta\(hours=24\)"
)
with pytest.raises(
ValueError,
Expand Down Expand Up @@ -1748,7 +1745,7 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
with pytest.raises(ValueError, match=msg):
to_datetime(np.array([1.5]), unit=unit, errors="raise")

msg = r"Given date string \"1.5\" not likely a datetime, at position 0"
msg = r"Given date string \"1.5\" not likely a datetime"
with pytest.raises(ValueError, match=msg):
to_datetime(["1.5"], unit=unit, errors="raise")

Expand Down Expand Up @@ -1802,7 +1799,7 @@ def test_unit_array_mixed_nans_large_int(self, cache):
def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache):
# if we have a string, then we raise a ValueError
# and NOT an OutOfBoundsDatetime
msg = "Unknown datetime string format, unable to parse: foo, at position 0"
msg = "Unknown datetime string format, unable to parse: foo"
with pytest.raises(ValueError, match=msg):
to_datetime("foo", errors="raise", unit="s", cache=cache)

Expand Down Expand Up @@ -1938,12 +1935,9 @@ def test_to_datetime_unit_na_values(self):
@pytest.mark.parametrize("bad_val", ["foo", 111111111])
def test_to_datetime_unit_invalid(self, bad_val):
if bad_val == "foo":
msg = (
"Unknown datetime string format, unable to parse: "
f"{bad_val}, at position 2"
)
msg = "Unknown datetime string format, unable to parse: " f"{bad_val}"
else:
msg = "cannot convert input 111111111 with the unit 'D', at position 2"
msg = "cannot convert input 111111111 with the unit 'D'"
with pytest.raises(ValueError, match=msg):
to_datetime([1, 2, bad_val], unit="D")

Expand Down Expand Up @@ -2096,7 +2090,7 @@ def test_dataframe_coerce(self, cache):

msg = (
r'^cannot assemble the datetimes: time data ".+" doesn\'t '
r'match format "%Y%m%d", at position 1\.'
r'match format "%Y%m%d"\.'
)
with pytest.raises(ValueError, match=msg):
to_datetime(df2, cache=cache)
Expand Down Expand Up @@ -2174,7 +2168,7 @@ def test_dataframe_float(self, cache):
df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
msg = (
r"^cannot assemble the datetimes: unconverted data remains when parsing "
r'with format ".*": "1", at position 0.'
r'with format ".*": "1".'
)
with pytest.raises(ValueError, match=msg):
to_datetime(df, cache=cache)
Expand All @@ -2196,7 +2190,7 @@ def test_to_datetime_barely_out_of_bounds(self):
# in an in-bounds datetime
arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object)

msg = "^Out of bounds nanosecond timestamp: .*, at position 0"
msg = "^Out of bounds nanosecond timestamp: .*"
with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime(arr)

Expand Down Expand Up @@ -2231,10 +2225,7 @@ def test_to_datetime_iso8601_fails(self, input, format, exact):
# `format` is longer than the string, so this fails regardless of `exact`
with pytest.raises(
ValueError,
match=(
rf"time data \"{input}\" doesn't match format "
rf"\"{format}\", at position 0"
),
match=(rf"time data \"{input}\" doesn't match format " rf"\"{format}\""),
):
to_datetime(input, format=format, exact=exact)

Expand All @@ -2253,10 +2244,9 @@ def test_to_datetime_iso8601_exact_fails(self, input, format):
# `format` is shorter than the date string, so only fails with `exact=True`
msg = "|".join(
[
'^unconverted data remains when parsing with format ".*": ".*"'
f", at position 0. {PARSING_ERR_MSG}$",
f'^time data ".*" doesn\'t match format ".*", at position 0. '
'^unconverted data remains when parsing with format ".*": ".*". '
f"{PARSING_ERR_MSG}$",
f'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$",
]
)
with pytest.raises(
Expand Down Expand Up @@ -2297,10 +2287,7 @@ def test_to_datetime_iso8601_separator(self, input, format):
# https://github.com/pandas-dev/pandas/issues/12649
with pytest.raises(
ValueError,
match=(
rf"time data \"{input}\" doesn\'t match format "
rf"\"{format}\", at position 0"
),
match=(rf"time data \"{input}\" doesn\'t match format " rf"\"{format}\""),
):
to_datetime(input, format=format)

Expand Down Expand Up @@ -2390,8 +2377,7 @@ def test_to_datetime_with_space_in_series(self, cache):
# GH 6428
ser = Series(["10/18/2006", "10/18/2008", " "])
msg = (
r'^time data " " doesn\'t match format "%m/%d/%Y", '
rf"at position 2. {PARSING_ERR_MSG}$"
r'^time data " " doesn\'t match format "%m/%d/%Y". ' rf"{PARSING_ERR_MSG}$"
)
with pytest.raises(ValueError, match=msg):
to_datetime(ser, errors="raise", cache=cache)
Expand Down Expand Up @@ -2466,7 +2452,7 @@ def test_to_datetime_strings_vs_constructor(self, result):
def test_to_datetime_unprocessable_input(self, cache):
# GH 4928
# GH 21864
msg = '^Given date string "1" not likely a datetime, at position 1$'
msg = '^Given date string "1" not likely a datetime$'
with pytest.raises(ValueError, match=msg):
to_datetime([1, "1"], errors="raise", cache=cache)

Expand Down Expand Up @@ -2643,7 +2629,7 @@ def test_dayfirst_warnings_invalid_input(self):
ValueError,
match=(
r'^time data "03/30/2011" doesn\'t match format '
rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$'
rf'"%d/%m/%Y". {PARSING_ERR_MSG}$'
),
):
to_datetime(arr, dayfirst=True)
Expand Down Expand Up @@ -2714,7 +2700,7 @@ def test_to_datetime_inconsistent_format(self, cache):
ser = Series(np.array(data))
msg = (
r'^time data "01-02-2011 00:00:00" doesn\'t match format '
rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$'
rf'"%m/%d/%Y %H:%M:%S". {PARSING_ERR_MSG}$'
)
with pytest.raises(ValueError, match=msg):
to_datetime(ser, cache=cache)
Expand Down Expand Up @@ -2820,7 +2806,7 @@ def test_day_not_in_month_coerce(self, cache, arg, format):
assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache))

def test_day_not_in_month_raise(self, cache):
msg = "day is out of range for month: 2015-02-29, at position 0"
msg = "day is out of range for month: 2015-02-29"
with pytest.raises(ValueError, match=msg):
to_datetime("2015-02-29", errors="raise", cache=cache)

Expand All @@ -2830,34 +2816,34 @@ def test_day_not_in_month_raise(self, cache):
(
"2015-02-29",
"%Y-%m-%d",
f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$",
f"^day is out of range for month. {PARSING_ERR_MSG}$",
),
(
"2015-29-02",
"%Y-%d-%m",
f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$",
f"^day is out of range for month. {PARSING_ERR_MSG}$",
),
(
"2015-02-32",
"%Y-%m-%d",
'^unconverted data remains when parsing with format "%Y-%m-%d": "2", '
f"at position 0. {PARSING_ERR_MSG}$",
'^unconverted data remains when parsing with format "%Y-%m-%d": "2". '
f"{PARSING_ERR_MSG}$",
),
(
"2015-32-02",
"%Y-%d-%m",
'^time data "2015-32-02" doesn\'t match format "%Y-%d-%m", '
f"at position 0. {PARSING_ERR_MSG}$",
'^time data "2015-32-02" doesn\'t match format "%Y-%d-%m". '
f"{PARSING_ERR_MSG}$",
),
(
"2015-04-31",
"%Y-%m-%d",
f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$",
f"^day is out of range for month. {PARSING_ERR_MSG}$",
),
(
"2015-31-04",
"%Y-%d-%m",
f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$",
f"^day is out of range for month. {PARSING_ERR_MSG}$",
),
],
)
Expand Down Expand Up @@ -3226,9 +3212,7 @@ def test_invalid_origins_tzinfo(self):

def test_incorrect_value_exception(self):
# GH47495
msg = (
"Unknown datetime string format, unable to parse: yesterday, at position 1"
)
msg = "Unknown datetime string format, unable to parse: yesterday"
with pytest.raises(ValueError, match=msg):
to_datetime(["today", "yesterday"])

Expand All @@ -3249,7 +3233,7 @@ def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning):
assert res.month == 10
assert res.day == 10
else:
msg = "unconverted data remains when parsing with format.*, at position 0"
msg = "unconverted data remains when parsing with format.*"
with pytest.raises(ValueError, match=msg):
to_datetime("2417-10-10 00:00:00.00", format=format)

Expand Down Expand Up @@ -3473,9 +3457,7 @@ def test_to_datetime_mixed_or_iso_exact(exact, format):

def test_to_datetime_mixed_not_necessarily_iso8601_raise():
# https://github.com/pandas-dev/pandas/issues/50411
with pytest.raises(
ValueError, match="Time data 01-01-2000 is not ISO8601 format, at position 1"
):
with pytest.raises(ValueError, match="Time data 01-01-2000 is not ISO8601 format"):
to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601")


Expand All @@ -3500,6 +3482,15 @@ def test_unknown_tz_raises():
to_datetime([dtstr])


def test_unformatted_input_raises():
    """Check that a non-matching string after many valid dates raises.

    The input holds ``start_caching_at`` copies of a valid ISO date followed
    by a single string that does not match the requested format; parsing
    with ``cache=True`` must raise ``ValueError`` naming the offending value.
    """
    # NOTE(review): presumably ``start_caching_at`` valid entries are used so
    # the cached conversion path is exercised -- confirm against its definition.
    date_strings = ["2024-01-01"] * start_caching_at
    date_strings.append("N")
    ser = Series(date_strings)

    expected_msg = 'time data "N" doesn\'t match format "%Y-%m-%d"'
    with pytest.raises(ValueError, match=expected_msg):
        to_datetime(ser, format="%Y-%m-%d", exact=True, cache=True)


def test_from_numeric_arrow_dtype(any_numeric_ea_dtype):
# GH 52425
pytest.importorskip("pyarrow")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/tslibs/test_array_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def test_to_datetime_barely_out_of_bounds():
# Close enough to bounds that dropping nanos
# would result in an in-bounds datetime.
arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object)
msg = "^Out of bounds nanosecond timestamp: 2262-04-11 23:47:16, at position 0$"
msg = "^Out of bounds nanosecond timestamp: 2262-04-11 23:47:16$"

with pytest.raises(tslib.OutOfBoundsDatetime, match=msg):
tslib.array_to_datetime(arr)
Expand Down