diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a1a2149da7cf6..74e3e1093eb75 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -902,6 +902,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.resolution` incorrectly returning "day" instead of "nanosecond" for nanosecond-resolution indexes (:issue:`46903`) - Bug in :class:`Timestamp` with an integer or float value and ``unit="Y"`` or ``unit="M"`` giving slightly-wrong results (:issue:`47266`) - Bug in :class:`.DatetimeArray` construction when passed another :class:`.DatetimeArray` and ``freq=None`` incorrectly inferring the freq from the given array (:issue:`47296`) +- Bug in :func:`to_datetime` where ``infer_datetime_format`` fallback would not run if ``errors=coerce`` (:issue:`46071`) - Bug in :func:`to_datetime` where ``OutOfBoundsDatetime`` would be thrown even if ``errors=coerce`` if there were more than 50 rows (:issue:`45319`) - Bug when adding a :class:`DateOffset` to a :class:`Series` would not add the ``nanoseconds`` field (:issue:`47856`) - diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 203a5711b7a59..0063df68c595b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -501,6 +501,10 @@ def _array_strptime_with_fallback( if "%Z" in fmt or "%z" in fmt: return _return_parsed_timezone_results(result, timezones, tz, name) + if infer_datetime_format and np.isnan(result).any(): + # Indicates to the caller to fallback to objects_to_datetime64ns + return None + return _box_as_indexlike(result, utc=utc, name=name) @@ -798,7 +802,10 @@ def to_datetime( If :const:`True` and no `format` is given, attempt to infer the format of the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. - In some cases this can increase the parsing speed by ~5-10x. + In some cases this can increase the parsing speed by ~5-10x. If subsequent + datetime strings do not follow the inferred format, parsing will fall + back to the slower method of determining the format for each + string individually. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index af1a292a2975a..68ffff3fc93f5 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -3,11 +3,13 @@ """ import operator +from dateutil.parser._parser import ParserError import numpy as np import pytest from pandas._libs.tslibs import tz_compare from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.errors import OutOfBoundsDatetime from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -639,3 +641,47 @@ def test_tz_localize_t2d(self): roundtrip = expected.tz_localize("US/Pacific") tm.assert_datetime_array_equal(roundtrip, dta) + + @pytest.mark.parametrize( + "error", + ["coerce", "raise"], + ) + def test_fallback_different_formats(self, error): + # GH#46071 + # 2 valid dates with different formats + # Should parse with no errors + s = pd.Series(["6/30/2025", "1 27 2024"]) + expected = pd.Series( + [pd.Timestamp("2025-06-30 00:00:00"), pd.Timestamp("2024-01-27 00:00:00")] + ) + result = pd.to_datetime(s, errors=error, infer_datetime_format=True) + tm.assert_series_equal(expected, result) + + @pytest.mark.parametrize( + "dateseries", + [ + pd.Series(["1/1/2000", "7/12/1200"]), + pd.Series(["1/1/2000", "Invalid input"]), + ], + ) + def test_fallback_with_errors_coerce(self, dateseries): + # GH#46071 + # Invalid inputs + # Parsing should fail for the second element + expected = pd.Series([pd.Timestamp("2000-01-01 00:00:00"), pd.NaT]) + result = pd.to_datetime(dateseries, errors="coerce", infer_datetime_format=True) + tm.assert_series_equal(expected, result) + + def test_fallback_with_errors_raise(self): + # GH#46071 + # Invalid inputs + # Parsing should fail for the second element + dates1 = pd.Series(["1/1/2000", "7/12/1200"]) + with pytest.raises( + OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp" + ): + pd.to_datetime(dates1, errors="raise", infer_datetime_format=True) + + dates2 = pd.Series(["1/1/2000", "Invalid input"]) + with pytest.raises(ParserError, match="Unknown string format: Invalid input"): + pd.to_datetime(dates2, errors="raise", infer_datetime_format=True)