diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9957ccb4fde50..e93dcebf20e3e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -101,6 +101,7 @@ Other enhancements - Added :meth:`Index.infer_objects` analogous to :meth:`Series.infer_objects` (:issue:`50034`) - Added ``copy`` parameter to :meth:`Series.infer_objects` and :meth:`DataFrame.infer_objects`, passing ``False`` will avoid making copies for series or columns that are already non-object or where no better dtype can be inferred (:issue:`50096`) - :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`) +- Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) - diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 6d834c5494a83..26317da62c8d9 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -263,7 +263,7 @@ def parse_datetime_string( datetime dt if not _does_string_look_like_datetime(date_string): - raise ValueError(f"Given date string {date_string} not likely a datetime") + raise ValueError(f'Given date string "{date_string}" not likely a datetime') if does_string_look_like_time(date_string): # use current datetime as default, not pass _DEFAULT_DATETIME @@ -297,7 +297,7 @@ def parse_datetime_string( except TypeError: # following may be raised from dateutil # TypeError: 'NoneType' object is not iterable - raise ValueError(f"Given date string {date_string} not likely a datetime") + raise ValueError(f'Given date string "{date_string}" not likely a datetime') return dt @@ -373,7 +373,7 @@ cdef parse_datetime_string_with_reso( int out_tzoffset if not _does_string_look_like_datetime(date_string): - raise ValueError(f"Given date string {date_string} not likely a datetime") + raise ValueError(f'Given date string "{date_string}" not likely a datetime') parsed, reso = _parse_delimited_date(date_string, dayfirst) if parsed is not None: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 73e9176d3a6d2..c1bc5fd0910f8 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -236,17 +236,22 @@ def array_strptime( if exact: found = format_regex.match(val) if not found: - raise ValueError(f"time data '{val}' does not match " - f"format '{fmt}' (match)") + raise ValueError(f"time data \"{val}\" at position {i} doesn't " + f"match format \"{fmt}\"") if len(val) != found.end(): - raise ValueError(f"unconverted data remains: {val[found.end():]}") + raise ValueError( + f"unconverted data remains at position {i}: " + f'"{val[found.end():]}"' + ) # search else: found = format_regex.search(val) if not found: - raise ValueError(f"time data {repr(val)} does not match format " - f"{repr(fmt)} (search)") + raise ValueError( + f"time data \"{val}\" at position {i} doesn't match " + f"format \"{fmt}\"" + ) iso_year = -1 year = 1900 @@ -396,7 +401,9 @@ def array_strptime( result_timezone[i] = tz - except (ValueError, OutOfBoundsDatetime): + except (ValueError, OutOfBoundsDatetime) as ex: + if isinstance(ex, OutOfBoundsDatetime): + ex.args = (f"{str(ex)} present at position {i}",) if is_coerce: iresult[i] = NPY_NAT continue diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 080410b0c913a..766b8fe805419 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1720,7 +1720,9 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 with pytest.raises( ValueError, - match=r"^time data '31/05/2000' does not match format '%m/%d/%Y' \(match\)$", + match=( + r'^time data "31/05/2000" at position 1 doesn\'t match format "%m/%d/%Y"$' + ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 112f23b3b0f16..0636ecb023530 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -303,7 +303,7 @@ def test_invalid_arguments(self): with pytest.raises(ValueError, match=msg): Period(month=1) - msg = "Given date string -2000 not likely a datetime" + msg = '^Given date string "-2000" not likely a datetime$' with pytest.raises(ValueError, match=msg): Period("-2000", "A") msg = "day is out of range for month" diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 83e40f5f1d98b..927388408cf27 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -479,7 +479,12 @@ def test_to_datetime_parse_timezone_malformed(self, offset): fmt = "%Y-%m-%d %H:%M:%S %z" date = "2010-01-01 12:00:00 " + offset - msg = "does not match format|unconverted data remains" + msg = "|".join( + [ + r'^time data ".*" at position 0 doesn\'t match format ".*"$', + r'^unconverted data remains at position 0: ".*"$', + ] + ) with pytest.raises(ValueError, match=msg): to_datetime([date], format=fmt) @@ -1093,7 +1098,7 @@ def test_datetime_bool_arrays_mixed(self, cache): to_datetime([False, datetime.today()], cache=cache) with pytest.raises( ValueError, - match=r"^time data 'True' does not match format '%Y%m%d' \(match\)$", + match=r'^time data "True" at position 1 doesn\'t match format "%Y%m%d"$', ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( @@ -1132,11 +1137,13 @@ def test_datetime_invalid_scalar(self, value, format, warning): res = to_datetime(value, errors="coerce", format=format) assert res is NaT - msg = ( - "does not match format|" - "unconverted data remains:|" - "second must be in 0..59|" - f"Given date string {value} not likely a datetime" + msg = "|".join( + [ + r'^time data "a" at position 0 doesn\'t match format "%H:%M:%S"$', + r'^Given date string "a" not likely a datetime present at position 0$', + r'^unconverted data remains at position 0: "9"$', + r"^second must be in 0..59: 00:01:99 present at position 0$", + ] ) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(warning, match="Could not infer format"): @@ -1157,7 +1164,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): assert res is NaT if format is not None: - msg = "does not match format|Out of bounds .* present at position 0" + msg = r'^time data ".*" at position 0 doesn\'t match format ".*"$' with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: @@ -1181,11 +1188,13 @@ def test_datetime_invalid_index(self, values, format, warning): res = to_datetime(values, errors="coerce", format=format) tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) - msg = ( - "does not match format|" - "unconverted data remains:|" - f"Given date string {values[0]} not likely a datetime|" - "second must be in 0..59" + msg = "|".join( + [ + r'^Given date string "a" not likely a datetime present at position 0$', + r'^time data "a" at position 0 doesn\'t match format "%H:%M:%S"$', + r'^unconverted data remains at position 0: "9"$', + r"^second must be in 0..59: 00:01:99 present at position 0$", + ] ) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(warning, match="Could not infer format"): @@ -1805,8 +1814,8 @@ def test_dataframe_coerce(self, cache): df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) msg = ( - "cannot assemble the datetimes: time data .+ does not " - r"match format '%Y%m%d' \(match\)" + r'^cannot assemble the datetimes: time data ".+" at position 1 doesn\'t ' + r'match format "%Y%m%d"$' ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) @@ -1882,7 +1891,10 @@ def test_dataframe_mixed(self, cache): def test_dataframe_float(self, cache): # float df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) - msg = "cannot assemble the datetimes: unconverted data remains: 1" + msg = ( + r"^cannot assemble the datetimes: unconverted data remains at position " + r'0: "1"$' + ) with pytest.raises(ValueError, match=msg): to_datetime(df, cache=cache) @@ -2072,7 +2084,7 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = r"^time data ' ' does not match format '%m/%d/%Y' \(match\)$" + msg = r'^time data " " at position 2 doesn\'t match format "%m/%d/%Y"$' with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2342,7 +2354,10 @@ def test_dayfirst_warnings_invalid_input(self): with pytest.raises( ValueError, - match=r"time data '03/30/2011' does not match format '%d/%m/%Y' \(match\)$", + match=( + r'^time data "03/30/2011" at position 1 doesn\'t match format ' + r'"%d/%m/%Y"$' + ), ): to_datetime(arr, dayfirst=True) @@ -2410,7 +2425,11 @@ def test_to_datetime_infer_datetime_format_consistent_format( def test_to_datetime_inconsistent_format(self, cache): data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) - with pytest.raises(ValueError, match="does not match format"): + msg = ( + r'^time data "01-02-2011 00:00:00" at position 1 doesn\'t match format ' + r'"%m/%d/%Y %H:%M:%S"$' + ) + with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) def test_to_datetime_consistent_format(self, cache): @@ -2923,17 +2942,22 @@ def test_incorrect_value_exception(self): to_datetime(["today", "yesterday"]) @pytest.mark.parametrize( - "format, warning", [(None, UserWarning), ("%Y-%m-%d %H:%M:%S", None)] + "format, warning", + [ + (None, UserWarning), + ("%Y-%m-%d %H:%M:%S", None), + ("%Y-%d-%m %H:%M:%S", None), + ], ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 msg = ( - "Out of bounds nanosecond timestamp: 2417-10-27 00:00:00 " - "present at position 0" + r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00 " + r"present at position 0$" ) with pytest.raises(OutOfBoundsDatetime, match=msg): with tm.assert_produces_warning(warning, match="Could not infer format"): - to_datetime("2417-10-27 00:00:00", format=format) + to_datetime("2417-10-10 00:00:00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str",