-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Parse %z and %Z directive in format for to_datetime #19979
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 27 commits
4a43815
cb47c08
f299aec
259ec8f
77af4db
54c2491
0e2a0cd
d31e141
7bdbdf4
3e3d5c6
c16ef8c
6f0b7f0
0525823
4c22808
24e1c0a
4f2f865
145e5da
64bc3fc
47a9d69
1b44554
0dcc59f
149781b
d99ef5a
0e5e3c6
9a2ea19
924859e
a1599a0
6c80c2e
abccc3e
473a0f4
ab0a692
56fc683
85bd45e
eb2a661
5500ca8
0e0d0fd
34f638c
757458d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ except: | |
except: | ||
from _dummy_thread import allocate_lock as _thread_allocate_lock | ||
|
||
import pytz | ||
|
||
from cython cimport Py_ssize_t | ||
from cpython cimport PyFloat_Check | ||
|
@@ -58,15 +59,36 @@ def array_strptime(ndarray[object] values, object fmt, | |
Py_ssize_t i, n = len(values) | ||
pandas_datetimestruct dts | ||
ndarray[int64_t] iresult | ||
int year, month, day, minute, hour, second, weekday, julian, tz | ||
int week_of_year, week_of_year_start | ||
ndarray[object] result_timezone | ||
int year, month, day, minute, hour, second, weekday, julian | ||
int week_of_year, week_of_year_start, parse_code | ||
int64_t us, ns | ||
object val, group_key, ampm, found | ||
object val, group_key, ampm, found, timezone | ||
dict found_key | ||
bint is_raise = errors=='raise' | ||
bint is_ignore = errors=='ignore' | ||
bint is_coerce = errors=='coerce' | ||
int ordinal | ||
dict _parse_code_table = {'y': 0, | ||
'Y': 1, | ||
'm': 2, | ||
'B': 3, | ||
'b': 4, | ||
'd': 5, | ||
'H': 6, | ||
'I': 7, | ||
'M': 8, | ||
'S': 9, | ||
'f': 10, | ||
'A': 11, | ||
'a': 12, | ||
'w': 13, | ||
'j': 14, | ||
'U': 15, | ||
'W': 16, | ||
'Z': 17, | ||
'p': 18, # an additional key, only with I | ||
'z': 19} | ||
|
||
assert is_raise or is_ignore or is_coerce | ||
|
||
|
@@ -108,32 +130,10 @@ def array_strptime(ndarray[object] values, object fmt, | |
|
||
result = np.empty(n, dtype='M8[ns]') | ||
iresult = result.view('i8') | ||
result_timezone = np.empty(n, dtype='object') | ||
|
||
dts.us = dts.ps = dts.as = 0 | ||
|
||
cdef dict _parse_code_table = { | ||
'y': 0, | ||
'Y': 1, | ||
'm': 2, | ||
'B': 3, | ||
'b': 4, | ||
'd': 5, | ||
'H': 6, | ||
'I': 7, | ||
'M': 8, | ||
'S': 9, | ||
'f': 10, | ||
'A': 11, | ||
'a': 12, | ||
'w': 13, | ||
'j': 14, | ||
'U': 15, | ||
'W': 16, | ||
'Z': 17, | ||
'p': 18 # just an additional key, works only with I | ||
} | ||
cdef int parse_code | ||
|
||
for i in range(n): | ||
val = values[i] | ||
if is_string_object(val): | ||
|
@@ -176,7 +176,7 @@ def array_strptime(ndarray[object] values, object fmt, | |
year = 1900 | ||
month = day = 1 | ||
hour = minute = second = ns = us = 0 | ||
tz = -1 | ||
timezone = None | ||
# Default to -1 to signify that values not known; not critical to have, | ||
# though | ||
week_of_year = -1 | ||
|
@@ -266,21 +266,10 @@ def array_strptime(ndarray[object] values, object fmt, | |
# W starts week on Monday. | ||
week_of_year_start = 0 | ||
elif parse_code == 17: | ||
# Since -1 is default value only need to worry about setting tz | ||
# if it can be something other than -1. | ||
found_zone = found_dict['Z'].lower() | ||
for value, tz_values in enumerate(locale_time.timezone): | ||
if found_zone in tz_values: | ||
# Deal w/ bad locale setup where timezone names are the | ||
# same and yet time.daylight is true; too ambiguous to | ||
# be able to tell what timezone has daylight savings | ||
if (time.tzname[0] == time.tzname[1] and | ||
time.daylight and found_zone not in ( | ||
"utc", "gmt")): | ||
break | ||
else: | ||
tz = value | ||
break | ||
timezone = pytz.timezone(found_dict['Z']) | ||
elif parse_code == 19: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you move this whole parse to a function and just call it here (and return the values as a tuple)? |
||
timezone = parse_timezone_directive(found_dict['z']) | ||
|
||
# If we know the wk of the year and what day of that wk, we can figure | ||
# out the Julian day of the year. | ||
if julian == -1 and week_of_year != -1 and weekday != -1: | ||
|
@@ -330,7 +319,9 @@ def array_strptime(ndarray[object] values, object fmt, | |
continue | ||
raise | ||
|
||
return result | ||
result_timezone[i] = timezone | ||
|
||
return result, result_timezone | ||
|
||
|
||
"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored | ||
|
@@ -538,14 +529,13 @@ class TimeRE(dict): | |
# XXX: Does 'Y' need to worry about having less or more than | ||
# 4 digits? | ||
'Y': r"(?P<Y>\d\d\d\d)", | ||
'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)", | ||
'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), | ||
'a': self.__seqToRE(self.locale_time.a_weekday, 'a'), | ||
'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), | ||
'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), | ||
'p': self.__seqToRE(self.locale_time.am_pm, 'p'), | ||
'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone | ||
for tz in tz_names], | ||
'Z'), | ||
'Z': self.__seqToRE(pytz.all_timezones, 'Z'), | ||
'%': '%'}) | ||
base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) | ||
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) | ||
|
@@ -632,3 +622,50 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year, | |
else: | ||
days_to_week = week_0_length + (7 * (week_of_year - 1)) | ||
return 1 + days_to_week + day_of_week | ||
|
||
cdef parse_timezone_directive(object z):
    """
    Parse the '%z' directive and return a pytz.FixedOffset

    Parameters
    ----------
    z : string of the UTC offset
        Either the literal 'Z' or a signed offset of the form
        [+-]HH[:]MM[[:]SS[.ffffff]], i.e. what the '%z' regex in TimeRE
        accepts.

    Returns
    -------
    pytz.FixedOffset
        Built from the offset expressed in whole minutes; the value is
        negated when ``z`` starts with '-'. 'Z' maps to an offset of 0.

    Raises
    ------
    ValueError
        If hours and minutes are separated by ':' but minutes and seconds
        are not.

    Notes
    -----
    This is essentially similar to the cpython implementation
    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479

    pytz.FixedOffset is constructed from an integer number of minutes, so
    the seconds and fractional-second parts of the offset are parsed (and
    validated) but truncated away.
    """

    cdef:
        int hours, minutes, seconds, pad_number, microseconds
        int total_minutes
        object gmtoff_remainder, gmtoff_remainder_padding

    if z == 'Z':
        return pytz.FixedOffset(0)

    # Normalise '+HH:MM[:SS[.ffffff]]' to '+HHMM[SS[.ffffff]]' so that the
    # fixed-position slices below work for both spellings.
    if z[3] == ':':
        z = z[:3] + z[4:]
        if len(z) > 5:
            if z[5] != ':':
                msg = "Inconsistent use of : in {0}"
                raise ValueError(msg.format(z))
            z = z[:5] + z[6:]
    hours = int(z[1:3])
    minutes = int(z[3:5])
    # z[5:7] is '' when no seconds were given.
    seconds = int(z[5:7] or 0)

    # Pad to always return microseconds.
    # z[7] is the '.', so the fractional digits start at index 8.
    gmtoff_remainder = z[8:]
    pad_number = 6 - len(gmtoff_remainder)
    gmtoff_remainder_padding = "0" * pad_number
    microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)

    # Explicit floor division: the operands are C ints and the target is a
    # C int, so the result must stay integral regardless of whether the
    # module is compiled with Python 2 or Python 3 division semantics.
    total_minutes = ((hours * 60) + minutes + (seconds // 60) +
                     (microseconds // 60000000))
    total_minutes = -total_minutes if z.startswith("-") else total_minutes
    return pytz.FixedOffset(total_minutes)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
from datetime import datetime, timedelta, time | ||
import numpy as np | ||
from collections import MutableMapping | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import tslib | ||
from pandas._libs.tslibs.strptime import array_strptime | ||
from pandas._libs.tslibs import parsing, conversion | ||
|
@@ -27,6 +28,7 @@ | |
ABCDataFrame) | ||
from pandas.core.dtypes.missing import notna | ||
from pandas.core import algorithms | ||
from pandas.compat import zip | ||
|
||
|
||
def _guess_datetime_format_for_array(arr, **kwargs): | ||
|
@@ -103,6 +105,36 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None): | |
return result.values | ||
|
||
|
||
def _return_parsed_timezone_results(result, timezones, box):
    """
    Combine the values and timezones produced by array_strptime.

    Used when the format passed to to_datetime contains a %z or %Z
    directive. Each value is wrapped in a Timestamp and localized to the
    timezone found at the same position.

    Parameters
    ----------
    result : ndarray
        parsed datetime values, as returned by array_strptime
    timezones : ndarray
        timezone objects, one per element of ``result``
    box : boolean
        True boxes result as an Index-like, False returns an ndarray

    Returns
    -------
    tz_result : parsed dates with timezone

        - Index-like if box=True
        - ndarray of Timestamps if box=False
    """
    localized = []
    for value, zone in zip(result, timezones):
        # Timezones are applied element-wise; they may differ between rows.
        localized.append(tslib.Timestamp(value).tz_localize(zone))
    tz_results = np.array(localized)

    if not box:
        return tz_results

    # Function-level import kept from the original implementation
    # (presumably to avoid a circular import -- TODO confirm).
    from pandas import Index
    return Index(tz_results)
|
||
|
||
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, | ||
utc=None, box=True, format=None, exact=True, | ||
unit=None, infer_datetime_format=False, origin='unix', | ||
|
@@ -343,8 +375,20 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): | |
# fallback | ||
if result is None: | ||
try: | ||
result = array_strptime(arg, format, exact=exact, | ||
errors=errors) | ||
parsing_tzname = '%Z' in format | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. woa, what do you need all this for??? |
||
parsing_tzoffset = '%z' in format | ||
if parsing_tzoffset and parsing_tzname: | ||
raise ValueError("Cannot parse both %Z and %z") | ||
elif tz is not None and (parsing_tzname or | ||
parsing_tzoffset): | ||
raise ValueError("Cannot pass a tz argument when " | ||
"parsing strings with timezone " | ||
"information.") | ||
result, timezones = array_strptime( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would much rather do the error handling in the _return_parsed_timezone_results. This block is just very complicated and hard to grok |
||
arg, format, exact=exact, errors=errors) | ||
if parsing_tzname or parsing_tzoffset: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do you need the parsing_tzoffset & parsing_tzname, won't tznames and tzoffsets be not None in this case? (and None if these are not found)? |
||
return _return_parsed_timezone_results( | ||
result, timezones, box) | ||
except tslib.OutOfBoundsDatetime: | ||
if errors == 'raise': | ||
raise | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -179,6 +179,55 @@ def test_to_datetime_format_weeks(self, cache): | |
for s, format, dt in data: | ||
assert to_datetime(s, format=format, cache=cache) == dt | ||
|
||
@pytest.mark.parametrize("box,const,assert_equal", [ | ||
[True, pd.Index, 'assert_index_equal'], | ||
[False, np.array, 'assert_numpy_array_equal']]) | ||
@pytest.mark.parametrize("fmt,dates,expected_dates", [ | ||
['%Y-%m-%d %H:%M:%S %Z', | ||
['2010-01-01 12:00:00 UTC'] * 2, | ||
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2], | ||
['%Y-%m-%d %H:%M:%S %Z', | ||
['2010-01-01 12:00:00 UTC', | ||
'2010-01-01 12:00:00 GMT', | ||
'2010-01-01 12:00:00 US/Pacific'], | ||
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC'), | ||
pd.Timestamp('2010-01-01 12:00:00', tz='GMT'), | ||
pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]], | ||
['%Y-%m-%d %H:%M:%S %z', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add one of them, e.g. this one, without the space before the tz? |
||
['2010-01-01 12:00:00 +0100'] * 2, | ||
[pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(60))] * 2], | ||
['%Y-%m-%d %H:%M:%S %z', | ||
['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'], | ||
[pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(60)), | ||
pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(-60))]], | ||
['%Y-%m-%d %H:%M:%S %z', | ||
['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this also work with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The regex I pulled from https://github.com/python/cpython/blob/master/Lib/_strptime.py has an option for 'Z' with But There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK (that's probably a newer addition to python), then it makes sense to follow upstream python to be consistent |
||
[pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(0)), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be UTC or a fixed offset of 0 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pytz coerces a fixed offset of 0 to UTC
But making it explicit here that %z should return There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the actual DatetimeIndex you get here has UTC timezone? OK, that's good! (but maybe add a small comment since I would not expect that) |
||
pd.Timestamp('2010-01-01 12:00:00', | ||
tzinfo=pytz.FixedOffset(0))]]]) | ||
def test_to_datetime_parse_tzname_or_tzoffset(self, box, const, | ||
assert_equal, fmt, | ||
dates, expected_dates): | ||
# GH 13486 | ||
result = pd.to_datetime(dates, format=fmt, box=box) | ||
expected = const(expected_dates) | ||
getattr(tm, assert_equal)(result, expected) | ||
|
||
with pytest.raises(ValueError): | ||
pd.to_datetime(dates, format=fmt, box=box, utc=True) | ||
|
||
@pytest.mark.parametrize('offset', [ | ||
'+0', '-1foo', 'UTCbar', ':10', '+01:000:01']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add an empty string here as well? |
||
def test_to_datetime_parse_timezone_malformed(self, offset): | ||
fmt = '%Y-%m-%d %H:%M:%S %z' | ||
date = '2010-01-01 12:00:00 ' + offset | ||
with pytest.raises(ValueError): | ||
pd.to_datetime([date], format=fmt) | ||
|
||
|
||
class TestToDatetime(object): | ||
def test_to_datetime_pydatetime(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you could make this a module level variable