Skip to content

BUG: pd.to_datetime(infer_datetime_format=True) drops timezone #42068

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ Timedelta

Timezones
^^^^^^^^^
-
- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` failing to parse zero UTC offset (``Z``) correctly (:issue:`41047`)
-

Numeric
Expand Down
40 changes: 35 additions & 5 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -821,15 +821,17 @@ def format_is_iso(f: str) -> bint:
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
but must be consistent. Leading 0s in dates and times are optional.
"""
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format
excluded_formats = ['%Y%m%d', '%Y%m', '%Y']

for date_sep in [' ', '/', '\\', '-', '.', '']:
for time_sep in [' ', 'T']:
if (iso_template(date_sep=date_sep,
time_sep=time_sep
).startswith(f) and f not in excluded_formats):
return True
for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']:
if (iso_template(date_sep=date_sep,
time_sep=time_sep,
micro_or_tz=micro_or_tz,
).startswith(f) and f not in excluded_formats):
return True
return False


Expand Down Expand Up @@ -883,6 +885,7 @@ def guess_datetime_format(
(('second',), '%S', 2),
(('microsecond',), '%f', 6),
(('second', 'microsecond'), '%S.%f', 0),
(('tzinfo',), '%z', 0),
(('tzinfo',), '%Z', 0),
]

Expand All @@ -903,6 +906,33 @@ def guess_datetime_format(
# that any user-provided function will not either.
tokens = dt_str_split(dt_str)

# Normalize offset part of tokens.
# There are multiple formats for the timezone offset.
# To pass the comparison condition between the output of `strftime` and
# joined tokens, which is carried out at the final step of the function,
# the offset part of the tokens must match the '%z' format like '+0900'
# instead of '+09:00'.
if parsed_datetime.tzinfo is not None:
offset_index = None
if len(tokens) > 0 and tokens[-1] == 'Z':
# the last 'Z' means zero offset
offset_index = -1
elif len(tokens) > 1 and tokens[-2] in ('+', '-'):
# ex. [..., '+', '0900']
offset_index = -2
elif len(tokens) > 3 and tokens[-4] in ('+', '-'):
# ex. [..., '+', '09', ':', '00']
offset_index = -4

if offset_index is not None:
# If the input string has a timezone offset like '+0900',
# the offset is separated into two tokens, ex. ['+', '0900'].
# This separation will prevent subsequent processing
# from correctly parsing the time zone format.
# So in addition to the format normalization, we rejoin them here.
tokens[offset_index] = parsed_datetime.strftime("%z")
tokens = tokens[:offset_index + 1 or None]

format_guess = [None] * len(tokens)
found_attrs = set()

Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1963,6 +1963,23 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset):
)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
    "ts,zero_tz,is_utc",
    [
        ("2019-02-02 08:07:13", "Z", True),
        ("2019-02-02 08:07:13", "", False),
        ("2019-02-02 08:07:13.012345", "Z", True),
        ("2019-02-02 08:07:13.012345", "", False),
    ],
)
def test_infer_datetime_format_zero_tz(self, ts, zero_tz, is_utc):
    """Check format inference with and without a trailing zero-offset marker.

    GH 41047. ``zero_tz`` is appended to ``ts`` to build the input string;
    the parsed result must equal ``Timestamp(ts)`` localized to UTC when
    ``is_utc`` is true and left timezone-naive otherwise.
    """
    raw = Series([ts + zero_tz])
    parsed = to_datetime(raw, infer_datetime_format=True)

    if is_utc:
        expected_tz = pytz.utc
    else:
        expected_tz = None
    wanted = Series([Timestamp(ts, tz=expected_tz)])

    tm.assert_series_equal(parsed, wanted)

@pytest.mark.parametrize("cache", [True, False])
def test_to_datetime_iso8601_noleading_0s(self, cache):
# GH 11871
Expand Down
50 changes: 50 additions & 0 deletions pandas/tests/tslibs/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,28 @@ def test_parsers_month_freq(date_str, expected):
("30-12-2011", "%d-%m-%Y"),
("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"),
("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"),
("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+090", None),
("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09:000", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09:", None),
("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"),
("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+090", None),
("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09:000", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09:", None),
("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
],
)
Expand Down Expand Up @@ -226,3 +248,31 @@ def test_parse_time_string_check_instance_type_raise_exception():
result = parse_time_string("2019")
expected = (datetime(2019, 1, 1), "year")
assert result == expected


@pytest.mark.parametrize(
    "fmt,expected",
    [
        ("%Y %m %d %H:%M:%S", True),
        ("%Y/%m/%d %H:%M:%S", True),
        (r"%Y\%m\%d %H:%M:%S", True),
        ("%Y-%m-%d %H:%M:%S", True),
        ("%Y.%m.%d %H:%M:%S", True),
        ("%Y%m%d %H:%M:%S", True),
        ("%Y-%m-%dT%H:%M:%S", True),
        ("%Y-%m-%dT%H:%M:%S%z", True),
        ("%Y-%m-%dT%H:%M:%S%Z", True),
        ("%Y-%m-%dT%H:%M:%S.%f", True),
        ("%Y-%m-%dT%H:%M:%S.%f%z", True),
        ("%Y-%m-%dT%H:%M:%S.%f%Z", True),
        ("%Y%m%d", False),
        ("%Y%m", False),
        ("%Y", False),
        ("%Y-%m-%d", True),
        ("%Y-%m", True),
    ],
)
def test_is_iso_format(fmt, expected):
    """``parsing.format_is_iso(fmt)`` must equal ``expected`` (see gh-41047).

    The cases cover each date separator, both time separators, the optional
    fractional-second and timezone suffixes, and the separator-less date-only
    formats that are expected to be rejected.
    """
    assert parsing.format_is_iso(fmt) == expected