diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1fb9a81e85a83..2a2941c7ff9a5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -637,6 +637,7 @@ Datetimelike - Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`) - Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`) - Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`49684`) +- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing :class:`Timestamp` or ``datetime`` objects with non-ISO8601 ``format`` (:issue:`49298`) - Timedelta diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi index 8e1acb2ff0d38..4565bb7ecf959 100644 --- a/pandas/_libs/tslibs/strptime.pyi +++ b/pandas/_libs/tslibs/strptime.pyi @@ -7,6 +7,7 @@ def array_strptime( fmt: str | None, exact: bool = ..., errors: str = ..., + utc: bool = ..., ) -> tuple[np.ndarray, np.ndarray]: ... # first ndarray is M8[ns], second is object ndarray of tzinfo | None diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index f540ad19c48d2..79944bc86a8cf 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,10 +1,14 @@ """Strptime-related classes and functions. """ from cpython.datetime cimport ( + PyDateTime_Check, date, + import_datetime, tzinfo, ) +import_datetime() + from _thread import allocate_lock as _thread_allocate_lock import numpy as np @@ -16,6 +20,7 @@ from numpy cimport ( ) from pandas._libs.missing cimport checknull_with_nat_and_na +from pandas._libs.tslibs.conversion cimport convert_timezone from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_nat_strings as nat_strings, @@ -25,7 +30,9 @@ from pandas._libs.tslibs.np_datetime cimport ( check_dts_bounds, npy_datetimestruct, npy_datetimestruct_to_datetime, + pydatetime_to_dt64, ) +from pandas._libs.tslibs.timestamps cimport _Timestamp cdef dict _parse_code_table = {'y': 0, @@ -53,7 +60,13 @@ cdef dict _parse_code_table = {'y': 0, 'u': 22} -def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='raise'): +def array_strptime( + ndarray[object] values, + str fmt, + bint exact=True, + errors='raise', + bint utc=False, +): """ Calculates the datetime structs represented by the passed array of strings @@ -78,6 +91,9 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai bint is_raise = errors=='raise' bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' + bint found_naive = False + bint found_tz = False + tzinfo tz_out = None assert is_raise or is_ignore or is_coerce @@ -128,12 +144,30 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai if val in nat_strings: iresult[i] = NPY_NAT continue - else: - if checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - continue + elif checknull_with_nat_and_na(val): + iresult[i] = NPY_NAT + continue + elif PyDateTime_Check(val): + if val.tzinfo is not None: + found_tz = True + else: + found_naive = True + tz_out = convert_timezone( + val.tzinfo, + tz_out, + found_naive, + found_tz, + utc, + ) + if isinstance(val, _Timestamp): + iresult[i] = val.tz_localize(None).as_unit("ns").value else: - val = str(val) + iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) + check_dts_bounds(&dts) + result_timezone[i] = val.tzinfo + continue + else: + val = str(val) # exact matching if exact: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 97d054df8287f..430343beb630b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -318,7 +318,14 @@ def _return_parsed_timezone_results( ) if utc: # Convert to the same tz - tz_results = np.array([tz_result.tz_convert("utc") for tz_result in tz_results]) + tz_results = np.array( + [ + tz_result.tz_convert("utc") + if tz_result.tzinfo is not None + else tz_result.tz_localize("utc") + for tz_result in tz_results + ] + ) return Index(tz_results, name=name) @@ -468,7 +475,9 @@ def _array_strptime_with_fallback( Call array_strptime, with fallback behavior depending on 'errors'. """ try: - result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors) + result, timezones = array_strptime( + arg, fmt, exact=exact, errors=errors, utc=utc + ) except OutOfBoundsDatetime: if errors == "raise": raise @@ -495,7 +504,7 @@ def _array_strptime_with_fallback( # Indicates to the caller to fallback to objects_to_datetime64ns return None else: - if "%Z" in fmt or "%z" in fmt: + if any(tz is not None for tz in timezones): return _return_parsed_timezone_results(result, timezones, utc, name) return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4c70aeb3e36aa..ec4fbfeee2c3d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -469,6 +469,88 @@ def test_to_datetime_mixed_datetime_and_string(self): expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60)) tm.assert_index_equal(res, expected) + @pytest.mark.parametrize( + "fmt", + ["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"], + ids=["non-ISO8601 format", "ISO8601 format"], + ) + @pytest.mark.parametrize( + "utc, args, expected", + [ + pytest.param( + True, + ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"], + DatetimeIndex( + ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"], + dtype="datetime64[ns, UTC]", + ), + id="all tz-aware, with utc", + ), + pytest.param( + False, + ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], + DatetimeIndex( + ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], + ), + id="all tz-aware, without utc", + ), + pytest.param( + True, + ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"], + DatetimeIndex( + ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"], + dtype="datetime64[ns, UTC]", + ), + id="all tz-aware, mixed offsets, with utc", + ), + ], + ) + @pytest.mark.parametrize( + "constructor", + [Timestamp, lambda x: Timestamp(x).to_pydatetime()], + ) + def test_to_datetime_mixed_datetime_and_string_with_format( + self, fmt, utc, args, expected, constructor + ): + # https://github.com/pandas-dev/pandas/issues/49298 + # note: ISO8601 formats go down a fastpath, so we need to check both + # a ISO8601 format and a non-ISO8601 one + ts1 = constructor(args[0]) + ts2 = args[1] + result = to_datetime([ts1, ts2], format=fmt, utc=utc) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "fmt", + ["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"], + ids=["non-ISO8601 format", "ISO8601 format"], + ) + @pytest.mark.parametrize( + "args", + [ + pytest.param( + ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-07:00"], + id="all tz-aware, mixed timezones, without utc", + ), + ], + ) + @pytest.mark.parametrize( + "constructor", + [Timestamp, lambda x: Timestamp(x).to_pydatetime()], + ) + def test_to_datetime_mixed_datetime_and_string_with_format_raises( + self, fmt, args, constructor + ): + # https://github.com/pandas-dev/pandas/issues/49298 + # note: ISO8601 formats go down a fastpath, so we need to check both + # a ISO8601 format and a non-ISO8601 one + ts1 = constructor(args[0]) + ts2 = constructor(args[1]) + with pytest.raises( + ValueError, match="cannot be converted to datetime64 unless utc=True" + ): + to_datetime([ts1, ts2], format=fmt, utc=False) + @pytest.mark.parametrize("infer_datetime_format", [True, False]) def test_to_datetime_np_str(self, infer_datetime_format): # GH#32264