From 52186cc137eda05cf9177313127921368442a67e Mon Sep 17 00:00:00 2001 From: maushumee Date: Sat, 24 Aug 2024 07:34:28 -0400 Subject: [PATCH 1/5] Remove index from error --- pandas/_libs/tslib.pyx | 2 +- pandas/_libs/tslibs/strptime.pyx | 2 +- pandas/tests/tools/test_to_datetime.py | 117 ++++++++++++------------- 3 files changed, 56 insertions(+), 65 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 928d253bf3169..3c5854602df53 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -439,7 +439,7 @@ cpdef array_to_datetime( raise TypeError(f"{type(val)} is not convertible to datetime") except (TypeError, OverflowError, ValueError) as ex: - ex.args = (f"{ex}, at position {i}",) + ex.args = (f"{ex}",) if is_coerce: iresult[i] = NPY_NAT continue diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index ccb1a1d6870f7..ed784b6f5ab22 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -536,7 +536,7 @@ def array_strptime( except ValueError as ex: ex.args = ( - f"{str(ex)}, at position {i}. You might want to try:\n" + f"{str(ex)}. You might want to try:\n" " - passing `format` if your strings have a consistent format;\n" " - passing `format='ISO8601'` if your strings are " "all ISO8601 but not necessarily in exactly the same format;\n" diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 658e16bfe5682..168d589070484 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -133,8 +133,7 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): with pytest.raises( ValueError, match=( - 'unconverted data remains when parsing with format "%Y%m%d": ".0", ' - "at position 0" + 'unconverted data remains when parsing with format "%Y%m%d": ".0". 
' ), ): # https://github.com/pandas-dev/pandas/issues/50051 @@ -514,10 +513,9 @@ def test_to_datetime_parse_timezone_malformed(self, offset): msg = "|".join( [ - r'^time data ".*" doesn\'t match format ".*", at position 0. ' + r'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$", + r'^unconverted data remains when parsing with format ".*": ".*". ' f"{PARSING_ERR_MSG}$", - r'^unconverted data remains when parsing with format ".*": ".*", ' - f"at position 0. {PARSING_ERR_MSG}$", ] ) with pytest.raises(ValueError, match=msg): @@ -539,7 +537,7 @@ def test_to_datetime_overflow(self): # TODO: Timestamp raises ValueError("could not convert string to Timestamp") # can we make these more consistent? arg = "08335394550" - msg = 'Parsing "08335394550" to datetime overflows, at position 0' + msg = 'Parsing "08335394550" to datetime overflows' with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(arg) @@ -1309,8 +1307,8 @@ def test_datetime_bool_arrays_mixed(self, cache): with pytest.raises( ValueError, match=( - r'^time data "True" doesn\'t match format "%Y%m%d", ' - f"at position 1. {PARSING_ERR_MSG}$" + r'^time data "True" doesn\'t match format "%Y%m%d". ' + f"{PARSING_ERR_MSG}$" ), ): to_datetime(["20130101", True], cache=cache) @@ -1345,12 +1343,12 @@ def test_datetime_invalid_scalar(self, value, format): msg = "|".join( [ - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + r'^time data "a" doesn\'t match format "%H:%M:%S". ' f"{PARSING_ERR_MSG}$", - r'^Given date string "a" not likely a datetime, at position 0$', - r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' - f"at position 0. {PARSING_ERR_MSG}$", - r"^second must be in 0..59: 00:01:99, at position 0$", + r'^Given date string "a" not likely a datetime$', + r'^unconverted data remains when parsing with format "%H:%M:%S": "9". 
' + f"{PARSING_ERR_MSG}$", + r"^second must be in 0..59: 00:01:99$", ] ) with pytest.raises(ValueError, match=msg): @@ -1368,7 +1366,7 @@ def test_datetime_outofbounds_scalar(self, value, format): assert res is NaT if format is not None: - msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' + msg = r'^time data ".*" doesn\'t match format ".*"' with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: @@ -1397,12 +1395,12 @@ def test_datetime_invalid_index(self, values, format): msg = "|".join( [ - r'^Given date string "a" not likely a datetime, at position 0$', - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + r'^Given date string "a" not likely a datetime$', + r'^time data "a" doesn\'t match format "%H:%M:%S". ' + f"{PARSING_ERR_MSG}$", + r'^unconverted data remains when parsing with format "%H:%M:%S": "9". ' f"{PARSING_ERR_MSG}$", - r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' - f"at position 0. 
{PARSING_ERR_MSG}$", - r"^second must be in 0..59: 00:01:99, at position 0$", + r"^second must be in 0..59: 00:01:99$", ] ) with pytest.raises(ValueError, match=msg): @@ -1582,8 +1580,7 @@ def test_to_datetime_malformed_raise(self): ts_strings = ["200622-12-31", "111111-24-11"] msg = ( 'Parsed string "200622-12-31" gives an invalid tzoffset, which must ' - r"be between -timedelta\(hours=24\) and timedelta\(hours=24\), " - "at position 0" + r"be between -timedelta\(hours=24\) and timedelta\(hours=24\)" ) with pytest.raises( ValueError, @@ -1748,7 +1745,7 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): with pytest.raises(ValueError, match=msg): to_datetime(np.array([1.5]), unit=unit, errors="raise") - msg = r"Given date string \"1.5\" not likely a datetime, at position 0" + msg = r"Given date string \"1.5\" not likely a datetime" with pytest.raises(ValueError, match=msg): to_datetime(["1.5"], unit=unit, errors="raise") @@ -1802,7 +1799,7 @@ def test_unit_array_mixed_nans_large_int(self, cache): def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache): # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - msg = "Unknown datetime string format, unable to parse: foo, at position 0" + msg = "Unknown datetime string format, unable to parse: foo" with pytest.raises(ValueError, match=msg): to_datetime("foo", errors="raise", unit="s", cache=cache) @@ -1938,12 +1935,9 @@ def test_to_datetime_unit_na_values(self): @pytest.mark.parametrize("bad_val", ["foo", 111111111]) def test_to_datetime_unit_invalid(self, bad_val): if bad_val == "foo": - msg = ( - "Unknown datetime string format, unable to parse: " - f"{bad_val}, at position 2" - ) + msg = "Unknown datetime string format, unable to parse: " f"{bad_val}" else: - msg = "cannot convert input 111111111 with the unit 'D', at position 2" + msg = "cannot convert input 111111111 with the unit 'D'" with pytest.raises(ValueError, match=msg): 
to_datetime([1, 2, bad_val], unit="D") @@ -2096,7 +2090,7 @@ def test_dataframe_coerce(self, cache): msg = ( r'^cannot assemble the datetimes: time data ".+" doesn\'t ' - r'match format "%Y%m%d", at position 1\.' + r'match format "%Y%m%d"\.' ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) @@ -2174,7 +2168,7 @@ def test_dataframe_float(self, cache): df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) msg = ( r"^cannot assemble the datetimes: unconverted data remains when parsing " - r'with format ".*": "1", at position 0.' + r'with format ".*": "1".' ) with pytest.raises(ValueError, match=msg): to_datetime(df, cache=cache) @@ -2196,7 +2190,7 @@ def test_to_datetime_barely_out_of_bounds(self): # in an in-bounds datetime arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) - msg = "^Out of bounds nanosecond timestamp: .*, at position 0" + msg = "^Out of bounds nanosecond timestamp: .*" with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(arr) @@ -2231,10 +2225,7 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): # `format` is longer than the string, so this fails regardless of `exact` with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" doesn't match format " - rf"\"{format}\", at position 0" - ), + match=(rf"time data \"{input}\" doesn't match format " rf"\"{format}\""), ): to_datetime(input, format=format, exact=exact) @@ -2253,10 +2244,9 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): # `format` is shorter than the date string, so only fails with `exact=True` msg = "|".join( [ - '^unconverted data remains when parsing with format ".*": ".*"' - f", at position 0. {PARSING_ERR_MSG}$", - f'^time data ".*" doesn\'t match format ".*", at position 0. ' + '^unconverted data remains when parsing with format ".*": ".*". ' f"{PARSING_ERR_MSG}$", + f'^time data ".*" doesn\'t match format ".*". 
' f"{PARSING_ERR_MSG}$", ] ) with pytest.raises( @@ -2297,10 +2287,7 @@ def test_to_datetime_iso8601_separator(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" doesn\'t match format " - rf"\"{format}\", at position 0" - ), + match=(rf"time data \"{input}\" doesn\'t match format " rf"\"{format}\""), ): to_datetime(input, format=format) @@ -2390,8 +2377,7 @@ def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) msg = ( - r'^time data " " doesn\'t match format "%m/%d/%Y", ' - rf"at position 2. {PARSING_ERR_MSG}$" + r'^time data " " doesn\'t match format "%m/%d/%Y". ' rf"{PARSING_ERR_MSG}$" ) with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) @@ -2466,7 +2452,7 @@ def test_to_datetime_strings_vs_constructor(self, result): def test_to_datetime_unprocessable_input(self, cache): # GH 4928 # GH 21864 - msg = '^Given date string "1" not likely a datetime, at position 1$' + msg = '^Given date string "1" not likely a datetime$' with pytest.raises(ValueError, match=msg): to_datetime([1, "1"], errors="raise", cache=cache) @@ -2643,7 +2629,7 @@ def test_dayfirst_warnings_invalid_input(self): ValueError, match=( r'^time data "03/30/2011" doesn\'t match format ' - rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$' + rf'"%d/%m/%Y". {PARSING_ERR_MSG}$' ), ): to_datetime(arr, dayfirst=True) @@ -2714,7 +2700,7 @@ def test_to_datetime_inconsistent_format(self, cache): ser = Series(np.array(data)) msg = ( r'^time data "01-02-2011 00:00:00" doesn\'t match format ' - rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$' + rf'"%m/%d/%Y %H:%M:%S". 
{PARSING_ERR_MSG}$' ) with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) @@ -2820,7 +2806,7 @@ def test_day_not_in_month_coerce(self, cache, arg, format): assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): - msg = "day is out of range for month: 2015-02-29, at position 0" + msg = "day is out of range for month: 2015-02-29" with pytest.raises(ValueError, match=msg): to_datetime("2015-02-29", errors="raise", cache=cache) @@ -2830,34 +2816,34 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-29", "%Y-%m-%d", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. {PARSING_ERR_MSG}$", ), ( "2015-29-02", "%Y-%d-%m", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. {PARSING_ERR_MSG}$", ), ( "2015-02-32", "%Y-%m-%d", - '^unconverted data remains when parsing with format "%Y-%m-%d": "2", ' - f"at position 0. {PARSING_ERR_MSG}$", + '^unconverted data remains when parsing with format "%Y-%m-%d": "2". ' + f"{PARSING_ERR_MSG}$", ), ( "2015-32-02", "%Y-%d-%m", - '^time data "2015-32-02" doesn\'t match format "%Y-%d-%m", ' - f"at position 0. {PARSING_ERR_MSG}$", + '^time data "2015-32-02" doesn\'t match format "%Y-%d-%m". ' + f"{PARSING_ERR_MSG}$", ), ( "2015-04-31", "%Y-%m-%d", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. {PARSING_ERR_MSG}$", ), ( "2015-31-04", "%Y-%d-%m", - f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", + f"^day is out of range for month. 
{PARSING_ERR_MSG}$", ), ], ) @@ -3226,9 +3212,7 @@ def test_invalid_origins_tzinfo(self): def test_incorrect_value_exception(self): # GH47495 - msg = ( - "Unknown datetime string format, unable to parse: yesterday, at position 1" - ) + msg = "Unknown datetime string format, unable to parse: yesterday" with pytest.raises(ValueError, match=msg): to_datetime(["today", "yesterday"]) @@ -3249,7 +3233,7 @@ def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): assert res.month == 10 assert res.day == 10 else: - msg = "unconverted data remains when parsing with format.*, at position 0" + msg = "unconverted data remains when parsing with format.*" with pytest.raises(ValueError, match=msg): to_datetime("2417-10-10 00:00:00.00", format=format) @@ -3473,9 +3457,7 @@ def test_to_datetime_mixed_or_iso_exact(exact, format): def test_to_datetime_mixed_not_necessarily_iso8601_raise(): # https://github.com/pandas-dev/pandas/issues/50411 - with pytest.raises( - ValueError, match="Time data 01-01-2000 is not ISO8601 format, at position 1" - ): + with pytest.raises(ValueError, match="Time data 01-01-2000 is not ISO8601 format"): to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601") @@ -3500,6 +3482,15 @@ def test_unknown_tz_raises(): to_datetime([dtstr]) +def test_unformatted_input_raises(): + valid, invalid = "2024-01-01", "N" + S = Series([valid] * 45000 + [invalid]) + msg = 'time data "N" doesn\'t match format "%Y-%m-%d"' + + with pytest.raises(ValueError, match=msg): + to_datetime(S, format="%Y-%m-%d", exact=True) + + def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): # GH 52425 pytest.importorskip("pyarrow") From 53cd943971a9c8da7055a8025534402ef946d6c3 Mon Sep 17 00:00:00 2001 From: maushumee Date: Sat, 24 Aug 2024 08:13:24 -0400 Subject: [PATCH 2/5] Fix test failures --- pandas/tests/frame/test_block_internals.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 4 ++-- pandas/tests/series/test_constructors.py | 2 +- 
pandas/tests/tslibs/test_array_to_datetime.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 47eb387abc8e8..25e66a0e1c03d 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -251,7 +251,7 @@ def f(dtype): f("float64") # 10822 - msg = "^Unknown datetime string format, unable to parse: aa, at position 0$" + msg = "^Unknown datetime string format, unable to parse: aa$" with pytest.raises(ValueError, match=msg): f("M8[ns]") diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 386348c4bd687..532fcc5cd880c 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -507,8 +507,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): with pytest.raises( ValueError, match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' - r"at position 1. You might want to try:" + r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y". 
' + r"You might want to try:" ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 57b14d4b82a63..1771a4dfdb71f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -83,7 +83,7 @@ def test_infer_with_date_and_datetime(self): def test_unparsable_strings_with_dt64_dtype(self): # pre-2.0 these would be silently ignored and come back with object dtype vals = ["aa"] - msg = "^Unknown datetime string format, unable to parse: aa, at position 0$" + msg = "^Unknown datetime string format, unable to parse: aa$" with pytest.raises(ValueError, match=msg): Series(vals, dtype="datetime64[ns]") diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 3c55ae2c6f904..fc0000553049e 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -260,7 +260,7 @@ def test_to_datetime_barely_out_of_bounds(): # Close enough to bounds that dropping nanos # would result in an in-bounds datetime. 
arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) - msg = "^Out of bounds nanosecond timestamp: 2262-04-11 23:47:16, at position 0$" + msg = "^Out of bounds nanosecond timestamp: 2262-04-11 23:47:16$" with pytest.raises(tslib.OutOfBoundsDatetime, match=msg): tslib.array_to_datetime(arr) From 4e9c24cfb8a3dcb9e953df8bfbcc8e5d4005b996 Mon Sep 17 00:00:00 2001 From: maushumee Date: Sat, 24 Aug 2024 08:56:58 -0400 Subject: [PATCH 3/5] Add entry to docs/source/whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1533f9267ce39..9cb2af555a445 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -553,6 +553,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`) +- Bug in :meth:`to_datetime` reporting an incorrect index in the error message when parsing fails. 
(:issue:`58298`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta From 632469a4da9c12178b0de7120e4572d3eabc9a30 Mon Sep 17 00:00:00 2001 From: Maushumee Date: Mon, 26 Aug 2024 17:29:13 -0400 Subject: [PATCH 4/5] Update pandas/tests/tools/test_to_datetime.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 168d589070484..e030b2b8a8c82 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3484,7 +3484,7 @@ def test_unknown_tz_raises(): def test_unformatted_input_raises(): valid, invalid = "2024-01-01", "N" - S = Series([valid] * 45000 + [invalid]) + ser = Series([valid] * start_caching_at + [invalid]) msg = 'time data "N" doesn\'t match format "%Y-%m-%d"' with pytest.raises(ValueError, match=msg): From 1198125d6e93952eb4ce0b2e9d1ed391d8b6463c Mon Sep 17 00:00:00 2001 From: Maushumee Date: Mon, 26 Aug 2024 17:29:24 -0400 Subject: [PATCH 5/5] Update pandas/tests/tools/test_to_datetime.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index e030b2b8a8c82..a9d3c235f63f6 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3488,7 +3488,7 @@ def test_unformatted_input_raises(): msg = 'time data "N" doesn\'t match format "%Y-%m-%d"' with pytest.raises(ValueError, match=msg): - to_datetime(S, format="%Y-%m-%d", exact=True) + to_datetime(ser, format="%Y-%m-%d", exact=True, 
cache=True) def test_from_numeric_arrow_dtype(any_numeric_ea_dtype):