Skip to content

Commit 2804681

Browse files
MarcoGorelli and aaossa authored
BUG: Do not fail when parsing pydatetime objects in pd.to_datetime (#49893)
* parent 0168e27 author Antonio Ossa Guerra <[email protected]> 1666800993 -0300 committer MarcoGorelli <> 1669293453 +0000 Parse `datetime` properly in `pd.to_datetime` When applying `pd.to_datetime` on an array-like structure that contains a `datetime.datetime` object, while using the `format` argument, a `ValueError` is raised because the `datetime.datetime` object does not match the expected format. The implemented solution looks for `datetime.datetime` instances in the `array_strptime` method. If an instance of this type is found, it's properly handled by the new `_parse_python_datetime_object`, which returns the expected NumPy datetime object. Signed-off-by: Antonio Ossa Guerra <[email protected]> * fixup * 🏷️ typing * ignore pylint nitpick * better naming * keep use-a-generator check * use fromisoformat * change awareness to be UTC * Revert "use fromisoformat" This reverts commit af272e1. * rename input to args Signed-off-by: Antonio Ossa Guerra <[email protected]> Co-authored-by: Antonio Ossa Guerra <[email protected]> Co-authored-by: MarcoGorelli <>
1 parent c65e3d2 commit 2804681

File tree

5 files changed

+136
-9
lines changed

5 files changed

+136
-9
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,7 @@ Datetimelike
638638
- Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`)
639639
- Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`)
640640
- Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`49684`)
641+
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing :class:`Timestamp` or ``datetime`` objects with non-ISO8601 ``format`` (:issue:`49298`)
641642
-
642643

643644
Timedelta

pandas/_libs/tslibs/strptime.pyi

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ def array_strptime(
77
fmt: str | None,
88
exact: bool = ...,
99
errors: str = ...,
10+
utc: bool = ...,
1011
) -> tuple[np.ndarray, np.ndarray]: ...
1112

1213
# first ndarray is M8[ns], second is object ndarray of tzinfo | None

pandas/_libs/tslibs/strptime.pyx

+40-6
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
"""Strptime-related classes and functions.
22
"""
33
from cpython.datetime cimport (
4+
PyDateTime_Check,
45
date,
6+
import_datetime,
57
tzinfo,
68
)
79

10+
import_datetime()
11+
812
from _thread import allocate_lock as _thread_allocate_lock
913

1014
import numpy as np
@@ -16,6 +20,7 @@ from numpy cimport (
1620
)
1721

1822
from pandas._libs.missing cimport checknull_with_nat_and_na
23+
from pandas._libs.tslibs.conversion cimport convert_timezone
1924
from pandas._libs.tslibs.nattype cimport (
2025
NPY_NAT,
2126
c_nat_strings as nat_strings,
@@ -25,7 +30,9 @@ from pandas._libs.tslibs.np_datetime cimport (
2530
check_dts_bounds,
2631
npy_datetimestruct,
2732
npy_datetimestruct_to_datetime,
33+
pydatetime_to_dt64,
2834
)
35+
from pandas._libs.tslibs.timestamps cimport _Timestamp
2936

3037

3138
cdef dict _parse_code_table = {'y': 0,
@@ -53,7 +60,13 @@ cdef dict _parse_code_table = {'y': 0,
5360
'u': 22}
5461

5562

56-
def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='raise'):
63+
def array_strptime(
64+
ndarray[object] values,
65+
str fmt,
66+
bint exact=True,
67+
errors='raise',
68+
bint utc=False,
69+
):
5770
"""
5871
Calculates the datetime structs represented by the passed array of strings
5972
@@ -78,6 +91,9 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai
7891
bint is_raise = errors=='raise'
7992
bint is_ignore = errors=='ignore'
8093
bint is_coerce = errors=='coerce'
94+
bint found_naive = False
95+
bint found_tz = False
96+
tzinfo tz_out = None
8197

8298
assert is_raise or is_ignore or is_coerce
8399

@@ -128,12 +144,30 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai
128144
if val in nat_strings:
129145
iresult[i] = NPY_NAT
130146
continue
131-
else:
132-
if checknull_with_nat_and_na(val):
133-
iresult[i] = NPY_NAT
134-
continue
147+
elif checknull_with_nat_and_na(val):
148+
iresult[i] = NPY_NAT
149+
continue
150+
elif PyDateTime_Check(val):
151+
if val.tzinfo is not None:
152+
found_tz = True
153+
else:
154+
found_naive = True
155+
tz_out = convert_timezone(
156+
val.tzinfo,
157+
tz_out,
158+
found_naive,
159+
found_tz,
160+
utc,
161+
)
162+
if isinstance(val, _Timestamp):
163+
iresult[i] = val.tz_localize(None).as_unit("ns").value
135164
else:
136-
val = str(val)
165+
iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts)
166+
check_dts_bounds(&dts)
167+
result_timezone[i] = val.tzinfo
168+
continue
169+
else:
170+
val = str(val)
137171

138172
# exact matching
139173
if exact:

pandas/core/tools/datetimes.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,14 @@ def _return_parsed_timezone_results(
318318
)
319319
if utc:
320320
# Convert to the same tz
321-
tz_results = np.array([tz_result.tz_convert("utc") for tz_result in tz_results])
321+
tz_results = np.array(
322+
[
323+
tz_result.tz_convert("utc")
324+
if tz_result.tzinfo is not None
325+
else tz_result.tz_localize("utc")
326+
for tz_result in tz_results
327+
]
328+
)
322329

323330
return Index(tz_results, name=name)
324331

@@ -468,7 +475,9 @@ def _array_strptime_with_fallback(
468475
Call array_strptime, with fallback behavior depending on 'errors'.
469476
"""
470477
try:
471-
result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors)
478+
result, timezones = array_strptime(
479+
arg, fmt, exact=exact, errors=errors, utc=utc
480+
)
472481
except OutOfBoundsDatetime:
473482
if errors == "raise":
474483
raise
@@ -495,7 +504,7 @@ def _array_strptime_with_fallback(
495504
# Indicates to the caller to fallback to objects_to_datetime64ns
496505
return None
497506
else:
498-
if "%Z" in fmt or "%z" in fmt:
507+
if any(tz is not None for tz in timezones):
499508
return _return_parsed_timezone_results(result, timezones, utc, name)
500509

501510
return _box_as_indexlike(result, utc=utc, name=name)

pandas/tests/tools/test_to_datetime.py

+82
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,88 @@ def test_to_datetime_mixed_datetime_and_string(self):
469469
expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60))
470470
tm.assert_index_equal(res, expected)
471471

472+
@pytest.mark.parametrize(
473+
"fmt",
474+
["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"],
475+
ids=["non-ISO8601 format", "ISO8601 format"],
476+
)
477+
@pytest.mark.parametrize(
478+
"utc, args, expected",
479+
[
480+
pytest.param(
481+
True,
482+
["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"],
483+
DatetimeIndex(
484+
["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"],
485+
dtype="datetime64[ns, UTC]",
486+
),
487+
id="all tz-aware, with utc",
488+
),
489+
pytest.param(
490+
False,
491+
["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"],
492+
DatetimeIndex(
493+
["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"],
494+
),
495+
id="all tz-aware, without utc",
496+
),
497+
pytest.param(
498+
True,
499+
["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"],
500+
DatetimeIndex(
501+
["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"],
502+
dtype="datetime64[ns, UTC]",
503+
),
504+
id="all tz-aware, mixed offsets, with utc",
505+
),
506+
],
507+
)
508+
@pytest.mark.parametrize(
509+
"constructor",
510+
[Timestamp, lambda x: Timestamp(x).to_pydatetime()],
511+
)
512+
def test_to_datetime_mixed_datetime_and_string_with_format(
513+
self, fmt, utc, args, expected, constructor
514+
):
515+
# https://github.com/pandas-dev/pandas/issues/49298
516+
# note: ISO8601 formats go down a fastpath, so we need to check both
517+
# an ISO8601 format and a non-ISO8601 one
518+
ts1 = constructor(args[0])
519+
ts2 = args[1]
520+
result = to_datetime([ts1, ts2], format=fmt, utc=utc)
521+
tm.assert_index_equal(result, expected)
522+
523+
@pytest.mark.parametrize(
524+
"fmt",
525+
["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"],
526+
ids=["non-ISO8601 format", "ISO8601 format"],
527+
)
528+
@pytest.mark.parametrize(
529+
"args",
530+
[
531+
pytest.param(
532+
["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-07:00"],
533+
id="all tz-aware, mixed timezones, without utc",
534+
),
535+
],
536+
)
537+
@pytest.mark.parametrize(
538+
"constructor",
539+
[Timestamp, lambda x: Timestamp(x).to_pydatetime()],
540+
)
541+
def test_to_datetime_mixed_datetime_and_string_with_format_raises(
542+
self, fmt, args, constructor
543+
):
544+
# https://github.com/pandas-dev/pandas/issues/49298
545+
# note: ISO8601 formats go down a fastpath, so we need to check both
546+
# an ISO8601 format and a non-ISO8601 one
547+
ts1 = constructor(args[0])
548+
ts2 = constructor(args[1])
549+
with pytest.raises(
550+
ValueError, match="cannot be converted to datetime64 unless utc=True"
551+
):
552+
to_datetime([ts1, ts2], format=fmt, utc=False)
553+
472554
@pytest.mark.parametrize("infer_datetime_format", [True, False])
473555
def test_to_datetime_np_str(self, infer_datetime_format):
474556
# GH#32264

0 commit comments

Comments
 (0)