pandas-dev · MarcoGorelli · Dec 28, 2022 · Dec 20, 2022 · Dec 21, 2022 · Dec 21, 2022
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -98,6 +98,7 @@ Other enhancements
 - Added :meth:`Index.infer_objects` analogous to :meth:`Series.infer_objects` (:issue:`50034`)
 - Added ``copy`` parameter to :meth:`Series.infer_objects` and :meth:`DataFrame.infer_objects`, passing ``False`` will avoid making copies for series or columns that are already non-object or where no better dtype can be inferred (:issue:`50096`)
 - :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`)
+- Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -515,9 +515,10 @@ cpdef array_to_datetime(
                             iresult[i] = NPY_NAT
                             continue
                         elif is_raise:
+                            match_msg = "match" if exact else "search"
                             raise ValueError(
                                 f"time data \"{val}\" at position {i} doesn't "
-                                f"match format \"{format}\""
+                                f"match format \"{format}\" ({match_msg})"
                             )
                         return values, tz_out
                     # these must be ns unit by-definition
@@ -564,9 +565,10 @@ cpdef array_to_datetime(
                                 iresult[i] = NPY_NAT
                                 continue
                             elif is_raise:
+                                match_msg = "match" if exact else "search"
                                 raise ValueError(
                                     f"time data \"{val}\" at position {i} doesn't "
-                                    f"match format \"{format}\""
+                                    f"match format \"{format}\" ({match_msg})"
                                 )
                             return values, tz_out
 

diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -263,7 +263,7 @@ def parse_datetime_string(
         datetime dt
 
     if not _does_string_look_like_datetime(date_string):
-        raise ValueError(f"Given date string {date_string} not likely a datetime")
+        raise ValueError(f'Given date string "{date_string}" not likely a datetime')
 
     if does_string_look_like_time(date_string):
         # use current datetime as default, not pass _DEFAULT_DATETIME
@@ -297,7 +297,7 @@ def parse_datetime_string(
     except TypeError:
         # following may be raised from dateutil
         # TypeError: 'NoneType' object is not iterable
-        raise ValueError(f"Given date string {date_string} not likely a datetime")
+        raise ValueError(f'Given date string "{date_string}" not likely a datetime')
 
     return dt
 
@@ -373,7 +373,7 @@ cdef parse_datetime_string_with_reso(
         int out_tzoffset
 
     if not _does_string_look_like_datetime(date_string):
-        raise ValueError(f"Given date string {date_string} not likely a datetime")
+        raise ValueError(f'Given date string "{date_string}" not likely a datetime')
 
     parsed, reso = _parse_delimited_date(date_string, dayfirst)
     if parsed is not None:

diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -236,17 +236,22 @@ def array_strptime(
             if exact:
                 found = format_regex.match(val)
                 if not found:
-                    raise ValueError(f"time data '{val}' does not match "
-                                     f"format '{fmt}' (match)")
+                    raise ValueError(f"time data \"{val}\" at position {i} doesn't "
+                                     f"match format \"{fmt}\" (match)")
                 if len(val) != found.end():
-                    raise ValueError(f"unconverted data remains: {val[found.end():]}")
+                    raise ValueError(
+                        f"unconverted data remains at position {i}: "
+                        f'"{val[found.end():]}"'
+                    )
 
             # search
             else:
                 found = format_regex.search(val)
                 if not found:
-                    raise ValueError(f"time data {repr(val)} does not match format "
-                                     f"{repr(fmt)} (search)")
+                    raise ValueError(
+                        f"time data \"{val}\" at position {i} doesn't match "
+                        f"format \"{fmt}\" (search)"
+                    )
 
             iso_year = -1
             year = 1900
@@ -396,7 +401,15 @@ def array_strptime(
 
             result_timezone[i] = tz
 
-        except (ValueError, OutOfBoundsDatetime):
+        except OutOfBoundsDatetime as ex:
+            ex.args = (f"{str(ex)} present at position {i}",)
+            if is_coerce:
+                iresult[i] = NPY_NAT
+                continue
+            elif is_raise:
+                raise
+            return values, []
+        except ValueError:
             if is_coerce:
                 iresult[i] = NPY_NAT
                 continue

diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -1717,7 +1717,10 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
     # GH46210
     with pytest.raises(
         ValueError,
-        match=r"^time data '31/05/2000' does not match format '%m/%d/%Y' \(match\)$",
+        match=(
+            r'^time data "31/05/2000" at position 1 doesn\'t match format "%m/%d/%Y" '
+            r"\(match\)$"
+        ),
     ):
         pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])
 

diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
@@ -303,7 +303,7 @@ def test_invalid_arguments(self):
         with pytest.raises(ValueError, match=msg):
             Period(month=1)
 
-        msg = "Given date string -2000 not likely a datetime"
+        msg = '^Given date string "-2000" not likely a datetime$'
         with pytest.raises(ValueError, match=msg):
             Period("-2000", "A")
         msg = "day is out of range for month"

diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -479,7 +479,10 @@ def test_to_datetime_parse_timezone_malformed(self, offset):
         fmt = "%Y-%m-%d %H:%M:%S %z"
         date = "2010-01-01 12:00:00 " + offset
 
-        msg = "does not match format|unconverted data remains"
+        msg = (
+            r'^(time data ".*" at position 0 doesn\'t match format ".*" \(match\)|'
+            r'unconverted data remains at position 0: ".*")$'
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime([date], format=fmt)
 
@@ -1093,7 +1096,10 @@ def test_datetime_bool_arrays_mixed(self, cache):
             to_datetime([False, datetime.today()], cache=cache)
         with pytest.raises(
             ValueError,
-            match=r"^time data 'True' does not match format '%Y%m%d' \(match\)$",
+            match=(
+                r'^time data "True" at position 1 doesn\'t match format "%Y%m%d" '
+                r"\(match\)$"
+            ),
         ):
             to_datetime(["20130101", True], cache=cache)
         tm.assert_index_equal(
@@ -1133,10 +1139,10 @@ def test_datetime_invalid_scalar(self, value, format, warning):
         assert res is NaT
 
         msg = (
-            "does not match format|"
-            "unconverted data remains:|"
-            "second must be in 0..59|"
-            f"Given date string {value} not likely a datetime"
+            r'^(time data "a" at position 0 doesn\'t match format "%H:%M:%S" \(match\)|'
+            r'Given date string "a" not likely a datetime present at position 0|'
+            r'unconverted data remains at position 0: "9"|'
+            r"second must be in 0..59: 00:01:99 present at position 0)$"
         )
         with pytest.raises(ValueError, match=msg):
             with tm.assert_produces_warning(warning, match="Could not infer format"):
@@ -1157,7 +1163,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning):
         assert res is NaT
 
         if format is not None:
-            msg = "does not match format|Out of bounds .* present at position 0"
+            msg = r'^time data ".*" at position 0 doesn\'t match format ".*" \(match\)$'
             with pytest.raises(ValueError, match=msg):
                 to_datetime(value, errors="raise", format=format)
         else:
@@ -1182,10 +1188,10 @@ def test_datetime_invalid_index(self, values, format, warning):
         tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values)))
 
         msg = (
-            "does not match format|"
-            "unconverted data remains:|"
-            f"Given date string {values[0]} not likely a datetime|"
-            "second must be in 0..59"
+            r'^(Given date string "a" not likely a datetime present at position 0|'
+            r'time data "a" at position 0 doesn\'t match format "%H:%M:%S" \(match\)|'
+            r'unconverted data remains at position 0: "9"|'
+            r"second must be in 0..59: 00:01:99 present at position 0)$"
         )
         with pytest.raises(ValueError, match=msg):
             with tm.assert_produces_warning(warning, match="Could not infer format"):
@@ -1805,8 +1811,8 @@ def test_dataframe_coerce(self, cache):
         df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]})
 
         msg = (
-            "cannot assemble the datetimes: time data .+ does not "
-            r"match format '%Y%m%d' \(match\)"
+            r'^cannot assemble the datetimes: time data ".+" at position 1 doesn\'t '
+            r'match format "%Y%m%d" \(match\)$'
         )
         with pytest.raises(ValueError, match=msg):
             to_datetime(df2, cache=cache)
@@ -1882,7 +1888,10 @@ def test_dataframe_mixed(self, cache):
     def test_dataframe_float(self, cache):
         # float
         df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
-        msg = "cannot assemble the datetimes: unconverted data remains: 1"
+        msg = (
+            r"^cannot assemble the datetimes: unconverted data remains at position "
+            r'0: "1"$'
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(df, cache=cache)
 
@@ -2072,7 +2081,9 @@ def test_to_datetime_on_datetime64_series(self, cache):
     def test_to_datetime_with_space_in_series(self, cache):
         # GH 6428
         ser = Series(["10/18/2006", "10/18/2008", " "])
-        msg = r"^time data ' ' does not match format '%m/%d/%Y' \(match\)$"
+        msg = (
+            r'^time data " " at position 2 doesn\'t match format "%m/%d/%Y" \(match\)$'
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, errors="raise", cache=cache)
         result_coerce = to_datetime(ser, errors="coerce", cache=cache)
@@ -2342,7 +2353,10 @@ def test_dayfirst_warnings_invalid_input(self):
 
         with pytest.raises(
             ValueError,
-            match=r"time data '03/30/2011' does not match format '%d/%m/%Y' \(match\)$",
+            match=(
+                r'^time data "03/30/2011" at position 1 doesn\'t match format '
+                r'"%d/%m/%Y" \(match\)$'
+            ),
         ):
             to_datetime(arr, dayfirst=True)
 
@@ -2410,7 +2424,11 @@ def test_to_datetime_infer_datetime_format_consistent_format(
     def test_to_datetime_inconsistent_format(self, cache):
         data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
         ser = Series(np.array(data))
-        with pytest.raises(ValueError, match="does not match format"):
+        msg = (
+            r'^time data "01-02-2011 00:00:00" at position 1 doesn\'t match format '
+            r'"%m/%d/%Y %H:%M:%S" \(match\)$'
+        )
+        with pytest.raises(ValueError, match=msg):
             to_datetime(ser, cache=cache)
 
     def test_to_datetime_consistent_format(self, cache):
@@ -2923,17 +2941,22 @@ def test_incorrect_value_exception(self):
                 to_datetime(["today", "yesterday"])
 
     @pytest.mark.parametrize(
-        "format, warning", [(None, UserWarning), ("%Y-%m-%d %H:%M:%S", None)]
+        "format, warning",
+        [
+            (None, UserWarning),
+            ("%Y-%m-%d %H:%M:%S", None),
+            ("%Y-%d-%m %H:%M:%S", None),
+        ],
     )
     def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning):
         # see gh-23830
         msg = (
-            "Out of bounds nanosecond timestamp: 2417-10-27 00:00:00 "
-            "present at position 0"
+            r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00 "
+            r"present at position 0$"
         )
         with pytest.raises(OutOfBoundsDatetime, match=msg):
             with tm.assert_produces_warning(warning, match="Could not infer format"):
-                to_datetime("2417-10-27 00:00:00", format=format)
+                to_datetime("2417-10-10 00:00:00", format=format)
 
     @pytest.mark.parametrize(
         "arg, origin, expected_str",