-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: pd.to_datetime(infer_datetime_format=True) drops timezone #42068
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
2b6e9f6
09a5108
1886b22
309b5d9
7198d51
f42be84
44dbf67
a031e39
8033fe7
267b971
a5136ed
0538167
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -822,15 +822,17 @@ def format_is_iso(f: str) -> bint: | |
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different | ||
but must be consistent. Leading 0s in dates and times are optional. | ||
""" | ||
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format | ||
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format | ||
excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] | ||
|
||
for date_sep in [' ', '/', '\\', '-', '.', '']: | ||
for time_sep in [' ', 'T']: | ||
if (iso_template(date_sep=date_sep, | ||
time_sep=time_sep | ||
).startswith(f) and f not in excluded_formats): | ||
return True | ||
for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']: | ||
if (iso_template(date_sep=date_sep, | ||
time_sep=time_sep, | ||
micro_or_tz=micro_or_tz, | ||
).startswith(f) and f not in excluded_formats): | ||
return True | ||
return False | ||
|
||
|
||
|
@@ -884,6 +886,7 @@ def guess_datetime_format( | |
(('second',), '%S', 2), | ||
(('microsecond',), '%f', 6), | ||
(('second', 'microsecond'), '%S.%f', 0), | ||
(('tzinfo',), '%z', 0), | ||
(('tzinfo',), '%Z', 0), | ||
] | ||
|
||
|
@@ -904,6 +907,53 @@ def guess_datetime_format( | |
# that any user-provided function will not either. | ||
tokens = dt_str_split(dt_str) | ||
|
||
# Normalize offset part of tokens. | ||
# There are multiple formats for the timezone offset. | ||
# To pass the comparison condition between the output of `strftime` and | ||
# joined tokens, which is carried out at the final step of the function, | ||
# the offset part of the tokens must match the '%z' format like '+0900' | ||
# instead of '+09:00'. | ||
if (parsed_datetime.tzinfo is not None): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: Don't need the parentheses around this if statement (and most of the other ones you have here). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, redundant parentheses are removed. |
||
if (len(tokens) > 0) and (tokens[-1] == 'Z'): | ||
# the last 'Z' means zero offset | ||
tokens[-1] = '+0000' | ||
else: | ||
# If the input string has a timezone offset like '+0900', | ||
mroeschke marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# the offset is separated into two tokens, ex. ['+', '0900']. | ||
# This separation will prevent subsequent processing | ||
# from correctly parsing the time zone format. | ||
# So in addition to the format normalization, we rejoin them here. | ||
if ( | ||
(len(tokens) > 3) | ||
and tokens[-1].isdigit() | ||
and (tokens[-2] == ':') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To be safe, I think this should be any non-digit separator, just in case some format uses something other than a colon. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I skipped the check about the separator string. So no assumptions are made about the separator type. |
||
and tokens[-3].isdigit() | ||
and (tokens[-4] in ('+', '-')) | ||
): | ||
# ex. [..., '+', '9', ':', '5'] -> [..., '+0905'] | ||
offset_idx = -4 | ||
sign, hour_offset, _, min_offset = tokens[offset_idx:] | ||
tokens[offset_idx] = ( | ||
f'{sign}{int(hour_offset):02d}{int(min_offset):02d}' | ||
) | ||
tokens = tokens[:offset_idx + 1] | ||
elif ( | ||
(len(tokens) > 1) | ||
and tokens[-1].isdigit() | ||
and (tokens[-2] in ('+', '-')) | ||
): | ||
# ex. [..., '+', '0905'] -> [..., '+0905'] | ||
offset_idx = -2 | ||
sign, offset = tokens[offset_idx:] | ||
if len(offset) <= 2: | ||
# '+09' -> '+0900' | ||
tokens[offset_idx] = f'{sign}{int(offset):02d}00' | ||
else: | ||
tokens[offset_idx] = f'{sign}{int(offset):04d}' | ||
tokens = tokens[:offset_idx + 1] | ||
|
||
# else: Other patterns are tried to parse as a timezone name. | ||
|
||
format_guess = [None] * len(tokens) | ||
found_attrs = set() | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for the delay. This PR didn't make it into 1.3. Could you move this to 1.4.0 whatsnew?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mroeschke OK!
Should I merge the current master and then fix the whatsnew file?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes that would be great, thank you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed whatsnew entry.