pandas-dev · srotondo · Jul 15, 2022 · Jul 15, 2022 · Jul 21, 2022 · Jul 22, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -902,6 +902,7 @@ Datetimelike
 - Bug in :meth:`DatetimeIndex.resolution` incorrectly returning "day" instead of "nanosecond" for nanosecond-resolution indexes (:issue:`46903`)
 - Bug in :class:`Timestamp` with an integer or float value and ``unit="Y"`` or ``unit="M"`` giving slightly-wrong results (:issue:`47266`)
 - Bug in :class:`.DatetimeArray` construction when passed another :class:`.DatetimeArray` and ``freq=None`` incorrectly inferring the freq from the given array (:issue:`47296`)
+- Bug in :func:`to_datetime` where the ``infer_datetime_format`` fallback would not run if ``errors="coerce"`` (:issue:`46071`)
 - Bug in :func:`to_datetime` where ``OutOfBoundsDatetime`` would be thrown even if ``errors=coerce`` if there were more than 50 rows (:issue:`45319`)
 - Bug when adding a :class:`DateOffset` to a :class:`Series` would not add the ``nanoseconds`` field (:issue:`47856`)
 -

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -501,6 +501,10 @@ def _array_strptime_with_fallback(
         if "%Z" in fmt or "%z" in fmt:
             return _return_parsed_timezone_results(result, timezones, tz, name)
 
+        if infer_datetime_format and np.isnan(result).any():
+            # Indicates to the caller to fall back to objects_to_datetime64ns
+            return None
+
     return _box_as_indexlike(result, utc=utc, name=name)
 
 
@@ -798,7 +802,10 @@ def to_datetime(
         If :const:`True` and no `format` is given, attempt to infer the format
         of the datetime strings based on the first non-NaN element,
         and if it can be inferred, switch to a faster method of parsing them.
-        In some cases this can increase the parsing speed by ~5-10x.
+        In some cases this can increase the parsing speed by ~5-10x. If subsequent
+        datetime strings do not follow the inferred format, parsing will fall
+        back to the slower method of determining the format for each
+        string individually.
     origin : scalar, default 'unix'
         Define the reference date. The numeric values would be parsed as number
         of units (defined by `unit`) since this reference date.

diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
@@ -3,11 +3,13 @@
 """
 import operator
 
+from dateutil.parser._parser import ParserError
 import numpy as np
 import pytest
 
 from pandas._libs.tslibs import tz_compare
 from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
+from pandas.errors import OutOfBoundsDatetime
 
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 
@@ -639,3 +641,47 @@ def test_tz_localize_t2d(self):
 
         roundtrip = expected.tz_localize("US/Pacific")
         tm.assert_datetime_array_equal(roundtrip, dta)
+
+    @pytest.mark.parametrize(
+        "error",
+        ["coerce", "raise"],
+    )
+    def test_fallback_different_formats(self, error):
     @pytest.mark.parametrize(
         "data",
         [
             ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"],
             ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"],
         ],
     )
     def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data):
         """Format inference must not change the result of parsing ``data``.

         The strings are parsed once with ``infer_datetime_format`` switched
         off and once with it switched on; both Series have to be equal.
         """
         strings = Series(np.array(data))

         plain = to_datetime(strings, infer_datetime_format=False, cache=cache)
         inferred = to_datetime(strings, infer_datetime_format=True, cache=cache)

         # Reference result (no inference) goes on the left
         tm.assert_series_equal(plain, inferred)
     @pytest.mark.parametrize(
         "data",
         [
             ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"],
             ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"],
         ],
     )
     def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data):
         """Check that turning on format inference leaves the parsed output unchanged.

         ``data`` is converted with and without ``infer_datetime_format``;
         the test passes only when both conversions yield equal Series.
         """
         raw = Series(np.array(data))

         baseline = to_datetime(raw, infer_datetime_format=False, cache=cache)
         with_inference = to_datetime(raw, infer_datetime_format=True, cache=cache)

         # Baseline (default parsing) is the left-hand operand of the comparison
         tm.assert_series_equal(baseline, with_inference)
+        # GH#46071
+        # 2 valid dates with different formats
+        # Should parse with no errors
+        s = pd.Series(["6/30/2025", "1 27 2024"])
+        expected = pd.Series(
+            [pd.Timestamp("2025-06-30 00:00:00"), pd.Timestamp("2024-01-27 00:00:00")]
+        )
+        result = pd.to_datetime(s, errors=error, infer_datetime_format=True)
+        tm.assert_series_equal(expected, result)
+
+    @pytest.mark.parametrize(
+        "dateseries",
+        [
+            pd.Series(["1/1/2000", "7/12/1200"]),
+            pd.Series(["1/1/2000", "Invalid input"]),
 @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]])
 @pytest.mark.parametrize("infer", [True, False])
 @pytest.mark.parametrize("format", [None, "H%:M%:S%"])
 def test_datetime_invalid_index(self, values, format, infer):
     """Exercise the three ``errors`` modes of ``to_datetime`` on bad input.

     Expected outcomes, one per mode:
     * ``"ignore"`` hands the input back as an ``Index`` of the same values,
     * ``"coerce"`` yields a ``DatetimeIndex`` holding one ``NaT`` per input,
     * ``"raise"`` raises ``ValueError`` matching one of the known messages.
     """
     # GH24763
     shared = {"format": format, "infer_datetime_format": infer}

     untouched = to_datetime(values, errors="ignore", **shared)
     tm.assert_index_equal(untouched, Index(values))

     coerced = to_datetime(values, errors="coerce", **shared)
     all_nat = DatetimeIndex([NaT for _ in values])
     tm.assert_index_equal(coerced, all_nat)

     # Alternation of the messages accepted for the parametrized inputs
     expected_error = (
         "is a bad directive in format|"
         f"Given date string {values[0]} not likely a datetime|"
         "second must be in 0..59"
     )
     with pytest.raises(ValueError, match=expected_error):
         to_datetime(values, errors="raise", **shared)
     @pytest.mark.parametrize(
         "data",
         [
             ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"],
             ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"],
         ],
     )
     def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data):
         """Format inference must not change the result of parsing ``data``.

         The strings are parsed once with ``infer_datetime_format`` switched
         off and once with it switched on; both Series have to be equal.
         """
         strings = Series(np.array(data))

         plain = to_datetime(strings, infer_datetime_format=False, cache=cache)
         inferred = to_datetime(strings, infer_datetime_format=True, cache=cache)

         # Reference result (no inference) goes on the left
         tm.assert_series_equal(plain, inferred)
 @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]])
 @pytest.mark.parametrize("infer", [True, False])
 @pytest.mark.parametrize("format", [None, "H%:M%:S%"])
 def test_datetime_invalid_index(self, values, format, infer):
     """Check how ``to_datetime`` treats unparseable input per ``errors`` mode.

     ``"ignore"`` must return the values unchanged as an ``Index``,
     ``"coerce"`` must return an all-``NaT`` ``DatetimeIndex`` of equal
     length, and ``"raise"`` must raise a ``ValueError``.
     """
     # GH24763
     parse_kwargs = {"format": format, "infer_datetime_format": infer}

     passthrough = to_datetime(values, errors="ignore", **parse_kwargs)
     tm.assert_index_equal(passthrough, Index(values))

     nat_filled = to_datetime(values, errors="coerce", **parse_kwargs)
     tm.assert_index_equal(nat_filled, DatetimeIndex([NaT for _ in values]))

     # Any one of these alternatives is an acceptable failure message
     pattern = (
         "is a bad directive in format|"
         f"Given date string {values[0]} not likely a datetime|"
         "second must be in 0..59"
     )
     with pytest.raises(ValueError, match=pattern):
         to_datetime(values, errors="raise", **parse_kwargs)
     @pytest.mark.parametrize(
         "data",
         [
             ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"],
             ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"],
         ],
     )
     def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data):
         """Check that turning on format inference leaves the parsed output unchanged.

         ``data`` is converted with and without ``infer_datetime_format``;
         the test passes only when both conversions yield equal Series.
         """
         raw = Series(np.array(data))

         baseline = to_datetime(raw, infer_datetime_format=False, cache=cache)
         with_inference = to_datetime(raw, infer_datetime_format=True, cache=cache)

         # Baseline (default parsing) is the left-hand operand of the comparison
         tm.assert_series_equal(baseline, with_inference)
+        ],
+    )
+    def test_fallback_with_errors_coerce(self, dateseries):
+        # GH#46071
+        # Invalid inputs
+        # Parsing should fail for the second element
+        expected = pd.Series([pd.Timestamp("2000-01-01 00:00:00"), pd.NaT])
+        result = pd.to_datetime(dateseries, errors="coerce", infer_datetime_format=True)
+        tm.assert_series_equal(expected, result)
+
+    def test_fallback_with_errors_raise(self):
+        # GH#46071
+        # Invalid inputs
+        # Parsing should fail for the second element
+        dates1 = pd.Series(["1/1/2000", "7/12/1200"])
+        with pytest.raises(
+            OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"
+        ):
+            pd.to_datetime(dates1, errors="raise", infer_datetime_format=True)
+
+        dates2 = pd.Series(["1/1/2000", "Invalid input"])
+        with pytest.raises(ParserError, match="Unknown string format: Invalid input"):
+            pd.to_datetime(dates2, errors="raise", infer_datetime_format=True)