Skip to content

BUG: Do not fail when parsing pydatetime objects in pd.to_datetime #49893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Dec 1, 2022
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,7 @@ Datetimelike
- Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`)
- Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`)
- Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`49684`)
- Bug in :func:`to_datetime` raising ``ValueError`` when parsing :class:`Timestamp` or ``datetime`` objects with a non-ISO8601 ``format`` (:issue:`49298`)
-

Timedelta
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/strptime.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def array_strptime(
fmt: str | None,
exact: bool = ...,
errors: str = ...,
utc: bool = ...,
) -> tuple[np.ndarray, np.ndarray]: ...

# first ndarray is M8[ns], second is object ndarray of tzinfo | None
46 changes: 40 additions & 6 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
"""Strptime-related classes and functions.
"""
from cpython.datetime cimport (
PyDateTime_Check,
date,
import_datetime,
tzinfo,
)

import_datetime()

from _thread import allocate_lock as _thread_allocate_lock

import numpy as np
Expand All @@ -16,6 +20,7 @@ from numpy cimport (
)

from pandas._libs.missing cimport checknull_with_nat_and_na
from pandas._libs.tslibs.conversion cimport convert_timezone
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_nat_strings as nat_strings,
Expand All @@ -25,7 +30,9 @@ from pandas._libs.tslibs.np_datetime cimport (
check_dts_bounds,
npy_datetimestruct,
npy_datetimestruct_to_datetime,
pydatetime_to_dt64,
)
from pandas._libs.tslibs.timestamps cimport _Timestamp


cdef dict _parse_code_table = {'y': 0,
Expand Down Expand Up @@ -53,7 +60,13 @@ cdef dict _parse_code_table = {'y': 0,
'u': 22}


def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='raise'):
def array_strptime(
ndarray[object] values,
str fmt,
bint exact=True,
errors='raise',
bint utc=False,
):
"""
Calculates the datetime structs represented by the passed array of strings

Expand All @@ -78,6 +91,9 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai
bint is_raise = errors=='raise'
bint is_ignore = errors=='ignore'
bint is_coerce = errors=='coerce'
bint found_naive = False
bint found_tz = False
tzinfo tz_out = None

assert is_raise or is_ignore or is_coerce

Expand Down Expand Up @@ -128,12 +144,30 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai
if val in nat_strings:
iresult[i] = NPY_NAT
continue
else:
if checknull_with_nat_and_na(val):
iresult[i] = NPY_NAT
continue
elif checknull_with_nat_and_na(val):
iresult[i] = NPY_NAT
continue
elif PyDateTime_Check(val):
if val.tzinfo is not None:
found_tz = True
else:
found_naive = True
tz_out = convert_timezone(
val.tzinfo,
tz_out,
found_naive,
found_tz,
utc,
)
if isinstance(val, _Timestamp):
iresult[i] = val.tz_localize(None).as_unit("ns").value
else:
val = str(val)
iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts)
check_dts_bounds(&dts)
result_timezone[i] = val.tzinfo
continue
else:
val = str(val)

# exact matching
if exact:
Expand Down
15 changes: 12 additions & 3 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,14 @@ def _return_parsed_timezone_results(
)
if utc:
# Convert to the same tz
tz_results = np.array([tz_result.tz_convert("utc") for tz_result in tz_results])
tz_results = np.array(
[
tz_result.tz_convert("utc")
if tz_result.tzinfo is not None
else tz_result.tz_localize("utc")
for tz_result in tz_results
]
)

return Index(tz_results, name=name)

Expand Down Expand Up @@ -468,7 +475,9 @@ def _array_strptime_with_fallback(
Call array_strptime, with fallback behavior depending on 'errors'.
"""
try:
result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors)
result, timezones = array_strptime(
arg, fmt, exact=exact, errors=errors, utc=utc
)
except OutOfBoundsDatetime:
if errors == "raise":
raise
Expand All @@ -495,7 +504,7 @@ def _array_strptime_with_fallback(
# Indicates to the caller to fallback to objects_to_datetime64ns
return None
else:
if "%Z" in fmt or "%z" in fmt:
if any(tz is not None for tz in timezones):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the difference here meant for the case where all the strings are tz-naive but we saw a tz-aware datetime object?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's right

return _return_parsed_timezone_results(result, timezones, utc, name)

return _box_as_indexlike(result, utc=utc, name=name)
Expand Down
82 changes: 82 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,88 @@ def test_to_datetime_mixed_datetime_and_string(self):
expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60))
tm.assert_index_equal(res, expected)

@pytest.mark.parametrize(
    "fmt",
    [
        pytest.param("%Y-%d-%m %H:%M:%S%z", id="non-ISO8601 format"),
        pytest.param("%Y-%m-%d %H:%M:%S%z", id="ISO8601 format"),
    ],
)
@pytest.mark.parametrize(
    "utc, args, expected",
    [
        (
            True,
            ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"],
            DatetimeIndex(
                ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"],
                dtype="datetime64[ns, UTC]",
            ),
        ),
        (
            False,
            ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"],
            DatetimeIndex(
                ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"],
            ),
        ),
        (
            True,
            ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"],
            DatetimeIndex(
                ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"],
                dtype="datetime64[ns, UTC]",
            ),
        ),
    ],
    ids=[
        "all tz-aware, with utc",
        "all tz-aware, without utc",
        "all tz-aware, mixed offsets, with utc",
    ],
)
@pytest.mark.parametrize(
    "constructor",
    [Timestamp, lambda x: Timestamp(x).to_pydatetime()],
)
def test_to_datetime_mixed_datetime_and_string_with_format(
    self, fmt, utc, args, expected, constructor
):
    """
    Parse a datetime-like object followed by a string with an explicit format.

    The first element of ``args`` is wrapped by ``constructor`` (yielding a
    ``Timestamp`` or a ``datetime``), the second is passed through as a plain
    string, and the parsed index is compared against ``expected``.
    """
    # https://github.com/pandas-dev/pandas/issues/49298
    # note: ISO8601 formats go down a fastpath, so both an ISO8601 format
    # and a non-ISO8601 one are exercised
    first, second = args
    mixed_values = [constructor(first), second]
    result = to_datetime(mixed_values, format=fmt, utc=utc)
    tm.assert_index_equal(result, expected)

@pytest.mark.parametrize(
    "fmt",
    [
        pytest.param("%Y-%d-%m %H:%M:%S%z", id="non-ISO8601 format"),
        pytest.param("%Y-%m-%d %H:%M:%S%z", id="ISO8601 format"),
    ],
)
@pytest.mark.parametrize(
    "args",
    [
        ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-07:00"],
    ],
    ids=["all tz-aware, mixed timezones, without utc"],
)
@pytest.mark.parametrize(
    "constructor",
    [Timestamp, lambda x: Timestamp(x).to_pydatetime()],
)
def test_to_datetime_mixed_datetime_and_string_with_format_raises(
    self, fmt, args, constructor
):
    """
    Expect a ValueError for datetime-like objects with differing offsets.

    Every entry of ``args`` is wrapped by ``constructor`` and the resulting
    objects are parsed with an explicit format and ``utc=False``.
    """
    # https://github.com/pandas-dev/pandas/issues/49298
    # note: ISO8601 formats go down a fastpath, so both an ISO8601 format
    # and a non-ISO8601 one are exercised
    msg = "cannot be converted to datetime64 unless utc=True"
    values = [constructor(arg) for arg in args]
    with pytest.raises(ValueError, match=msg):
        to_datetime(values, format=fmt, utc=False)

@pytest.mark.parametrize("infer_datetime_format", [True, False])
def test_to_datetime_np_str(self, infer_datetime_format):
# GH#32264
Expand Down