From 0d320e9ebe921f724f1e076c7f864badc91c3b67 Mon Sep 17 00:00:00 2001 From: Steven Rotondo Date: Fri, 15 Jul 2022 11:16:28 -0700 Subject: [PATCH 1/7] BUG: Fixed behavior with fallback between raise and coerce #46071 --- pandas/core/tools/datetimes.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d4d61df915acb..8c4d0ce24c5e6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -215,6 +215,7 @@ def _maybe_cache( cache_array : Series Cache of converted, unique dates. Can be empty """ + from pandas import Series cache_array = Series(dtype=object) @@ -391,7 +392,6 @@ def _convert_listlike_datetimes( raise TypeError( "arg must be a string, datetime, list, tuple, 1-d array, or Series" ) - # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg @@ -411,7 +411,6 @@ def _convert_listlike_datetimes( if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred @@ -428,7 +427,6 @@ def _convert_listlike_datetimes( ) if res is not None: return res - assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( @@ -440,7 +438,6 @@ def _convert_listlike_datetimes( require_iso8601=require_iso8601, allow_object=True, ) - if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC @@ -495,6 +492,8 @@ def _array_strptime_with_fallback( else: if "%Z" in fmt or "%z" in fmt: return _return_parsed_timezone_results(result, timezones, tz, name) + if infer_datetime_format and np.isnan(result).any(): + return None return _box_as_indexlike(result, utc=utc, name=name) @@ -513,7 +512,6 @@ def _to_datetime_with_format( Try parsing with the given format, returning None on failure. """ result = None - # shortcut formatting here if fmt == "%Y%m%d": # pass orig_arg as float-dtype may have been converted to @@ -1029,6 +1027,7 @@ def to_datetime( '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ + if arg is None: return None From df08fd13038f98cd7f5636c9aade02891dc4c959 Mon Sep 17 00:00:00 2001 From: Steven Rotondo Date: Fri, 15 Jul 2022 11:22:29 -0700 Subject: [PATCH 2/7] BUG: Fixed behavior with fallback between raise and coerce #46071 --- pandas/core/tools/datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8c4d0ce24c5e6..7909a7bfc500c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -492,6 +492,7 @@ def _array_strptime_with_fallback( else: if "%Z" in fmt or "%z" in fmt: return _return_parsed_timezone_results(result, timezones, tz, name) + # GH#46071 if infer_datetime_format and np.isnan(result).any(): return None From 2ab558a00b04f310e9b22aa0a0adba6d1e9597a3 Mon Sep 17 00:00:00 2001 From: Steven Rotondo Date: Fri, 22 Jul 2022 16:44:59 -0700 Subject: [PATCH 3/7] BUG: Added test and release note and removed line shifts #46071 --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/tools/datetimes.py | 9 ++++++--- pandas/tests/arrays/test_datetimes.py | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 16fcb34fdb7d1..acee25a76f9f6 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -827,7 +827,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.resolution` incorrectly returning "day" instead of "nanosecond" for nanosecond-resolution indexes (:issue:`46903`) - Bug in :class:`Timestamp` with an integer or float value and ``unit="Y"`` or ``unit="M"`` giving slightly-wrong results (:issue:`47266`) - Bug in :class:`.DatetimeArray` construction when passed another :class:`.DatetimeArray` and ``freq=None`` incorrectly inferring the freq from the given array (:issue:`47296`) -- +- Bug in :func:`to_datetime` where ``infer_datetime_format`` fallback would not run if ``errors=coerce`` (:issue:`46071`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1156f4e4d3254..782c51a979945 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -215,7 +215,6 @@ def _maybe_cache( cache_array : Series Cache of converted, unique dates. Can be empty """ - from pandas import Series cache_array = Series(dtype=object) @@ -392,6 +391,7 @@ def _convert_listlike_datetimes( raise TypeError( "arg must be a string, datetime, list, tuple, 1-d array, or Series" ) + # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg @@ -411,6 +411,7 @@ def _convert_listlike_datetimes( if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred @@ -427,6 +428,7 @@ def _convert_listlike_datetimes( ) if res is not None: return res + assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( @@ -438,6 +440,7 @@ def _convert_listlike_datetimes( require_iso8601=require_iso8601, allow_object=True, ) + if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC @@ -492,7 +495,7 @@ def _array_strptime_with_fallback( else: if "%Z" in fmt or "%z" in fmt: return _return_parsed_timezone_results(result, timezones, tz, name) - # GH#46071 + if infer_datetime_format and np.isnan(result).any(): return None @@ -513,6 +516,7 @@ def _to_datetime_with_format( Try parsing with the given format, returning None on failure. """ result = None + # shortcut formatting here if fmt == "%Y%m%d": # pass orig_arg as float-dtype may have been converted to @@ -1028,7 +1032,6 @@ def to_datetime( '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ - if arg is None: return None diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index af1a292a2975a..cff19e04842f8 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -639,3 +639,21 @@ def test_tz_localize_t2d(self): roundtrip = expected.tz_localize("US/Pacific") tm.assert_datetime_array_equal(roundtrip, dta) + + @pytest.mark.parametrize( + "error", + ["coerce", "raise"], + ) + def test_coerce_fallback(self, error): + # GH#46071 + s = pd.Series(["6/30/2025", "1 27 2024"]) + expected = pd.Series( + [pd.Timestamp("2025-06-30 00:00:00"), pd.Timestamp("2024-01-27 00:00:00")] + ) + + result = pd.to_datetime(s, errors=error, infer_datetime_format=True) + + if error == "coerce": + assert result[1] is not pd.NaT + + tm.assert_series_equal(expected, result) From 78e6bc770c11542019f08978395db2764eedcabb Mon Sep 17 00:00:00 2001 From: Steven Rotondo Date: Thu, 11 Aug 2022 09:22:06 -0700 Subject: [PATCH 4/7] BUG: Edited documentation and improved tests #46071 --- pandas/core/tools/datetimes.py | 5 ++++- pandas/tests/arrays/test_datetimes.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 782c51a979945..2a23fa98a2506 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -796,7 +796,10 @@ def to_datetime( If :const:`True` and no `format` is given, attempt to infer the format of the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. - In some cases this can increase the parsing speed by ~5-10x. + In some cases this can increase the parsing speed by ~5-10x. If subsequent + datetime strings do not follow the inferred format, parsing will fall + back to the slower method of determining the format for each + string individually. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index cff19e04842f8..556fc1ee6e1b1 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -3,11 +3,13 @@ """ import operator +from dateutil.parser._parser import ParserError import numpy as np import pytest from pandas._libs.tslibs import tz_compare from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.errors import OutOfBoundsDatetime from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -657,3 +659,22 @@ def test_coerce_fallback(self, error): assert result[1] is not pd.NaT tm.assert_series_equal(expected, result) + + expected2 = pd.Series([pd.Timestamp("2000-01-01 00:00:00"), pd.NaT]) + + es1 = pd.Series(["1/1/2000", "7/12/1200"]) + es2 = pd.Series(["1/1/2000", "Hello"]) + + if error == "coerce": + eres1 = pd.to_datetime(es1, errors=error, infer_datetime_format=True) + eres2 = pd.to_datetime(es2, errors=error, infer_datetime_format=True) + tm.assert_series_equal(expected2, eres1) + tm.assert_series_equal(expected2, eres2) + else: + with pytest.raises( + OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp" + ): + pd.to_datetime(es1, errors=error, infer_datetime_format=True) + + with pytest.raises(ParserError, match="Unknown string format: Hello"): + pd.to_datetime(es2, errors=error, infer_datetime_format=True) From ef1f73692d3dfb6f6f92adfc431a1c82ce470ab8 Mon Sep 17 00:00:00 2001 From: Steven Rotondo Date: Wed, 17 Aug 2022 14:42:46 -0700 Subject: [PATCH 5/7] BUG: Edited test structure #46071 --- pandas/tests/arrays/test_datetimes.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 556fc1ee6e1b1..66780265324dc 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -648,27 +648,26 @@ def test_tz_localize_t2d(self): ) def test_coerce_fallback(self, error): # GH#46071 + # 2 valid dates with different formats + # Should parse with no errors s = pd.Series(["6/30/2025", "1 27 2024"]) expected = pd.Series( [pd.Timestamp("2025-06-30 00:00:00"), pd.Timestamp("2024-01-27 00:00:00")] ) - result = pd.to_datetime(s, errors=error, infer_datetime_format=True) - - if error == "coerce": - assert result[1] is not pd.NaT - tm.assert_series_equal(expected, result) + # Invalid inputs + # Errors should be raised for the second element expected2 = pd.Series([pd.Timestamp("2000-01-01 00:00:00"), pd.NaT]) - + # Out of bounds date es1 = pd.Series(["1/1/2000", "7/12/1200"]) - es2 = pd.Series(["1/1/2000", "Hello"]) - + # Invalid input string + es2 = pd.Series(["1/1/2000", "Invalid input"]) if error == "coerce": eres1 = pd.to_datetime(es1, errors=error, infer_datetime_format=True) - eres2 = pd.to_datetime(es2, errors=error, infer_datetime_format=True) tm.assert_series_equal(expected2, eres1) + eres2 = pd.to_datetime(es2, errors=error, infer_datetime_format=True) tm.assert_series_equal(expected2, eres2) else: with pytest.raises( @@ -676,5 +675,7 @@ def test_coerce_fallback(self, error): ): pd.to_datetime(es1, errors=error, infer_datetime_format=True) - with pytest.raises(ParserError, match="Unknown string format: Hello"): + with pytest.raises( + ParserError, match="Unknown string format: Invalid input" + ): pd.to_datetime(es2, errors=error, infer_datetime_format=True) From 7347c780b8b79175be9d173d9510e3f25a29ef6e Mon Sep 17 00:00:00 2001 From: Steven Rotondo Date: Fri, 2 Sep 2022 11:07:35 -0700 Subject: [PATCH 6/7] BUG: Split up tests #46071 --- pandas/tests/arrays/test_datetimes.py | 48 +++++++++++++++------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 66780265324dc..68ffff3fc93f5 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -646,7 +646,7 @@ def test_tz_localize_t2d(self): "error", ["coerce", "raise"], ) - def test_coerce_fallback(self, error): + def test_fallback_different_formats(self, error): # GH#46071 # 2 valid dates with different formats # Should parse with no errors @@ -657,25 +657,31 @@ def test_coerce_fallback(self, error): result = pd.to_datetime(s, errors=error, infer_datetime_format=True) tm.assert_series_equal(expected, result) + @pytest.mark.parametrize( + "dateseries", + [ + pd.Series(["1/1/2000", "7/12/1200"]), + pd.Series(["1/1/2000", "Invalid input"]), + ], + ) + def test_fallback_with_errors_coerce(self, dateseries): + # GH#46071 # Invalid inputs - # Errors should be raised for the second element - expected2 = pd.Series([pd.Timestamp("2000-01-01 00:00:00"), pd.NaT]) - # Out of bounds date - es1 = pd.Series(["1/1/2000", "7/12/1200"]) - # Invalid input string - es2 = pd.Series(["1/1/2000", "Invalid input"]) - if error == "coerce": - eres1 = pd.to_datetime(es1, errors=error, infer_datetime_format=True) - tm.assert_series_equal(expected2, eres1) - eres2 = pd.to_datetime(es2, errors=error, infer_datetime_format=True) - tm.assert_series_equal(expected2, eres2) - else: - with pytest.raises( - OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp" - ): - pd.to_datetime(es1, errors=error, infer_datetime_format=True) + # Parsing should fail for the second element + expected = pd.Series([pd.Timestamp("2000-01-01 00:00:00"), pd.NaT]) + result = pd.to_datetime(dateseries, errors="coerce", infer_datetime_format=True) + tm.assert_series_equal(expected, result) - with pytest.raises( - ParserError, match="Unknown string format: Invalid input" - ): - pd.to_datetime(es2, errors=error, infer_datetime_format=True) + def test_fallback_with_errors_raise(self): + # GH#46071 + # Invalid inputs + # Parsing should fail for the second element + dates1 = pd.Series(["1/1/2000", "7/12/1200"]) + with pytest.raises( + OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp" + ): + pd.to_datetime(dates1, errors="raise", infer_datetime_format=True) + + dates2 = pd.Series(["1/1/2000", "Invalid input"]) + with pytest.raises(ParserError, match="Unknown string format: Invalid input"): + pd.to_datetime(dates2, errors="raise", infer_datetime_format=True) From 3b08c718f2ba062417ae39710e23dab7bb7b00eb Mon Sep 17 00:00:00 2001 From: Steven Rotondo <97266896+srotondo@users.noreply.github.com> Date: Sun, 18 Sep 2022 14:06:28 -0700 Subject: [PATCH 7/7] Update pandas/core/tools/datetimes.py Co-authored-by: Marco Edward Gorelli --- pandas/core/tools/datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6417db7b3974d..0063df68c595b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -502,6 +502,7 @@ def _array_strptime_with_fallback( return _return_parsed_timezone_results(result, timezones, tz, name) if infer_datetime_format and np.isnan(result).any(): + # Indicates to the caller to fallback to objects_to_datetime64ns return None return _box_as_indexlike(result, utc=utc, name=name)