-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Parse %z and %Z directive in format for to_datetime #19979
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 32 commits
4a43815
cb47c08
f299aec
259ec8f
77af4db
54c2491
0e2a0cd
d31e141
7bdbdf4
3e3d5c6
c16ef8c
6f0b7f0
0525823
4c22808
24e1c0a
4f2f865
145e5da
64bc3fc
47a9d69
1b44554
0dcc59f
149781b
d99ef5a
0e5e3c6
9a2ea19
924859e
a1599a0
6c80c2e
abccc3e
473a0f4
ab0a692
56fc683
85bd45e
eb2a661
5500ca8
0e0d0fd
34f638c
757458d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
from datetime import datetime, timedelta, time | ||
import numpy as np | ||
from collections import MutableMapping | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import tslib | ||
from pandas._libs.tslibs.strptime import array_strptime | ||
from pandas._libs.tslibs import parsing, conversion | ||
|
@@ -27,6 +28,7 @@ | |
ABCDataFrame) | ||
from pandas.core.dtypes.missing import notna | ||
from pandas.core import algorithms | ||
from pandas.compat import zip | ||
|
||
|
||
def _guess_datetime_format_for_array(arr, **kwargs): | ||
|
@@ -103,6 +105,41 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None): | |
return result.values | ||
|
||
|
||
def _return_parsed_timezone_results(result, timezones, box, tz):
    """
    Return results from array_strptime if a %z or %Z directive was passed.

    Parameters
    ----------
    result : ndarray
        int64 date representations of the dates
    timezones : ndarray
        pytz timezone objects
    box : boolean
        True boxes result as an Index-like, False returns an ndarray
    tz : object
        None or pytz timezone object

    Returns
    -------
    tz_result : ndarray of parsed dates with timezone
        Returns:

        - Index-like if box=True
        - ndarray of Timestamps if box=False

    Raises
    ------
    ValueError
        If ``tz`` is not None; the timezone comes from the parsed strings,
        so an additional ``tz`` argument is rejected.
    """
    if tz is not None:
        raise ValueError("Cannot pass a tz argument when "
                         "parsing strings with timezone "
                         "information.")
    # Each value is localized to its own parsed zone. dtype=object is given
    # explicitly so that empty input produces an empty object array instead
    # of numpy's default float64.
    tz_results = np.array([tslib.Timestamp(res).tz_localize(zone)
                           for res, zone in zip(result, timezones)],
                          dtype=object)
    if box:
        # Imported locally, as in the original block.
        from pandas import Index
        return Index(tz_results)
    return tz_results
|
||
|
||
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, | ||
utc=None, box=True, format=None, exact=True, | ||
unit=None, infer_datetime_format=False, origin='unix', | ||
|
@@ -343,8 +380,11 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): | |
# fallback | ||
if result is None: | ||
try: | ||
result = array_strptime(arg, format, exact=exact, | ||
errors=errors) | ||
result, timezones = array_strptime( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would much rather do the error handling in `_return_parsed_timezone_results`. This block is very complicated and hard to grok. |
||
arg, format, exact=exact, errors=errors) | ||
if '%Z' in format or '%z' in format: | ||
return _return_parsed_timezone_results( | ||
result, timezones, box, tz) | ||
except tslib.OutOfBoundsDatetime: | ||
if errors == 'raise': | ||
raise | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -179,6 +179,55 @@ def test_to_datetime_format_weeks(self, cache): | |
for s, format, dt in data: | ||
assert to_datetime(s, format=format, cache=cache) == dt | ||
|
||
@pytest.mark.parametrize("box,const,assert_equal", [ | ||
[True, pd.Index, 'assert_index_equal'], | ||
[False, np.array, 'assert_numpy_array_equal']]) | ||
@pytest.mark.parametrize("fmt,dates,expected_dates", [ | ||
['%Y-%m-%d %H:%M:%S %Z', | ||
['2010-01-01 12:00:00 UTC'] * 2, | ||
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2], | ||
['%Y-%m-%d %H:%M:%S %Z', | ||
['2010-01-01 12:00:00 UTC', | ||
'2010-01-01 12:00:00 GMT', | ||
'2010-01-01 12:00:00 US/Pacific'], | ||
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC'), | ||
pd.Timestamp('2010-01-01 12:00:00', tz='GMT'), | ||
pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]], | ||
['%Y-%m-%d %H:%M:%S %z', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you write one of them, e.g. this one, without the space before the tz? |
||
['2010-01-01 12:00:00 +0100'] * 2, | ||
[pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(60))] * 2], | ||
['%Y-%m-%d %H:%M:%S %z', | ||
['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'], | ||
[pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(60)), | ||
pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(-60))]], | ||
['%Y-%m-%d %H:%M:%S %z', | ||
['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this also work with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The regex I pulled from https://github.com/python/cpython/blob/master/Lib/_strptime.py has an option for 'Z' with But There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK (that's probably a newer addition to python), then it makes sense to follow upstream python to be consistent |
||
[pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(0)), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be UTC or a fixed offset of 0 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pytz coerces a fixed offset of 0 to UTC
But making it explicit here that %z should return There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the actual DatetimeIndex you get here has UTC timezone? OK, that's good! (but maybe add a small comment since I would not expect that) |
||
pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(0))]]]) | ||
def test_to_datetime_parse_tzname_or_tzoffset(self, box, const, | ||
assert_equal, fmt, | ||
dates, expected_dates): | ||
# GH 13486 | ||
result = pd.to_datetime(dates, format=fmt, box=box) | ||
expected = const(expected_dates) | ||
getattr(tm, assert_equal)(result, expected) | ||
|
||
with pytest.raises(ValueError): | ||
pd.to_datetime(dates, format=fmt, box=box, utc=True) | ||
|
||
@pytest.mark.parametrize('offset', [ | ||
'+0', '-1foo', 'UTCbar', ':10', '+01:000:01']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add an empty string here as well? |
||
def test_to_datetime_parse_timezone_malformed(self, offset): | ||
fmt = '%Y-%m-%d %H:%M:%S %z' | ||
date = '2010-01-01 12:00:00 ' + offset | ||
with pytest.raises(ValueError): | ||
pd.to_datetime([date], format=fmt) | ||
|
||
|
||
class TestToDatetime(object): | ||
def test_to_datetime_pydatetime(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you move this whole parse to a function and just call it here (and return the values as a tuple)?