Skip to content

BUG: pd.to_datetime(infer_datetime_format=True) drops timezone #42068

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,7 @@ Timezones
^^^^^^^^^
- Bug in different ``tzinfo`` objects representing UTC not being treated as equivalent (:issue:`39216`)
- Bug in ``dateutil.tz.gettz("UTC")`` not being recognized as equivalent to other UTC-representing tzinfos (:issue:`39276`)
- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` failing to parse zero UTC offset (``Z``) correctly (:issue:`41047`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the delay. This PR didn't make it into 1.3. Could you move this to 1.4.0 whatsnew?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke OK!
Should I merge the current master and then fix the whatsnew file?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that would be great, thank you.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed whatsnew entry.


Numeric
^^^^^^^
Expand Down
59 changes: 54 additions & 5 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -822,15 +822,17 @@ def format_is_iso(f: str) -> bint:
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
but must be consistent. Leading 0s in dates and times are optional.
"""
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format
excluded_formats = ['%Y%m%d', '%Y%m', '%Y']

for date_sep in [' ', '/', '\\', '-', '.', '']:
for time_sep in [' ', 'T']:
if (iso_template(date_sep=date_sep,
time_sep=time_sep
).startswith(f) and f not in excluded_formats):
return True
for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']:
if (iso_template(date_sep=date_sep,
time_sep=time_sep,
micro_or_tz=micro_or_tz,
).startswith(f) and f not in excluded_formats):
return True
return False


Expand Down Expand Up @@ -884,6 +886,7 @@ def guess_datetime_format(
(('second',), '%S', 2),
(('microsecond',), '%f', 6),
(('second', 'microsecond'), '%S.%f', 0),
(('tzinfo',), '%z', 0),
(('tzinfo',), '%Z', 0),
]

Expand All @@ -904,6 +907,52 @@ def guess_datetime_format(
# that any user-provided function will not either.
tokens = dt_str_split(dt_str)

# Normalize offset part of tokens.
# There are multiple formats for the timezone offset.
# To pass the comparison condition between the output of `strftime` and
# joined tokens, which is carried out at the final step of the function,
# the offset part of the tokens must match the '%z' format like '+0900'
# instead of '+09:00'.
if parsed_datetime.tzinfo is not None:
if len(tokens) > 0 and tokens[-1] == 'Z':
# the last 'Z' means zero offset
tokens[-1] = '+0000'
else:
# If the input string has a timezone offset like '+0900',
# the offset is separated into two tokens, ex. ['+', '0900'].
# This separation will prevent subsequent processing
# from correctly parsing the time zone format.
# So in addition to the format normalization, we rejoin them here.
if (
len(tokens) > 3
and tokens[-1].isdigit()
and tokens[-3].isdigit()
and tokens[-4] in ('+', '-')
):
# ex. [..., '+', '9', ':', '5'] -> [..., '+0905']
offset_idx = -4
sign, hour_offset, _, min_offset = tokens[offset_idx:]
tokens[offset_idx] = (
f'{sign}{int(hour_offset):02d}{int(min_offset):02d}'
)
tokens = tokens[:offset_idx + 1]
elif (
len(tokens) > 1
and tokens[-1].isdigit()
and tokens[-2] in ('+', '-')
):
# ex. [..., '+', '0905'] -> [..., '+0905']
offset_idx = -2
sign, offset = tokens[offset_idx:]
if len(offset) <= 2:
# '+09' -> '+0900'
tokens[offset_idx] = f'{sign}{int(offset):02d}00'
else:
tokens[offset_idx] = f'{sign}{int(offset):04d}'
tokens = tokens[:offset_idx + 1]

# else: Other patterns are tried to parse as a timezone name.

format_guess = [None] * len(tokens)
found_attrs = set()

Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1941,6 +1941,23 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset):
)
tm.assert_series_equal(result, expected)

# Each case is (timestamp text, suffix appended to it, whether the expected
# result is UTC-aware). The same timestamp is exercised with and without a
# trailing "Z", and with and without fractional seconds.
@pytest.mark.parametrize(
"ts,zero_tz,is_utc",
[
("2019-02-02 08:07:13", "Z", True),
("2019-02-02 08:07:13", "", False),
("2019-02-02 08:07:13.012345", "Z", True),
("2019-02-02 08:07:13.012345", "", False),
],
)
def test_infer_datetime_format_zero_tz(self, ts, zero_tz, is_utc):
# GH 41047
# Parse a one-element Series built from the timestamp plus its suffix,
# with format inference enabled.
s = Series([ts + zero_tz])
result = to_datetime(s, infer_datetime_format=True)
# With the "Z" suffix the expected value carries pytz.utc; without it the
# expected value is timezone-naive (tz=None).
tz = pytz.utc if is_utc else None
expected = Series([Timestamp(ts, tz=tz)])
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("cache", [True, False])
def test_to_datetime_iso8601_noleading_0s(self, cache):
# GH 11871
Expand Down
50 changes: 50 additions & 0 deletions pandas/tests/tslibs/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,28 @@ def test_parsers_month_freq(date_str, expected):
("30-12-2011", "%d-%m-%Y"),
("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"),
("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"),
("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+090", None),
("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09:000", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"),
("2011-12-30T00:00:00+09:", None),
("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"),
("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+090", None),
("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09:000", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09:", None),
("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
],
)
Expand Down Expand Up @@ -226,3 +248,31 @@ def test_parse_time_string_check_instance_type_raise_exception():
result = parse_time_string("2019")
expected = (datetime(2019, 1, 1), "year")
assert result == expected


# Each case is (strftime-style format string, expected result of
# parsing.format_is_iso for that format).
@pytest.mark.parametrize(
"fmt,expected",
[
# Date + time with each date separator tried here: space, "/", "\",
# "-", "." and no separator at all.
("%Y %m %d %H:%M:%S", True),
("%Y/%m/%d %H:%M:%S", True),
(r"%Y\%m\%d %H:%M:%S", True),
("%Y-%m-%d %H:%M:%S", True),
("%Y.%m.%d %H:%M:%S", True),
("%Y%m%d %H:%M:%S", True),
# "T" as the date/time separator, optionally followed by a timezone
# directive and/or fractional seconds.
("%Y-%m-%dT%H:%M:%S", True),
("%Y-%m-%dT%H:%M:%S%z", True),
("%Y-%m-%dT%H:%M:%S%Z", True),
("%Y-%m-%dT%H:%M:%S.%f", True),
("%Y-%m-%dT%H:%M:%S.%f%z", True),
("%Y-%m-%dT%H:%M:%S.%f%Z", True),
# Separator-less date-only formats are expected to be rejected.
("%Y%m%d", False),
("%Y%m", False),
("%Y", False),
# Date-only prefixes that keep a separator are expected to be accepted.
("%Y-%m-%d", True),
("%Y-%m", True),
],
)
def test_is_iso_format(fmt, expected):
# see gh-41047
# The function under test must return exactly the expected flag.
result = parsing.format_is_iso(fmt)
assert result == expected