Skip to content

Commit 7b1f9bf

Browse files
mroeschkejreback
authored andcommitted
ENH: Parse %z and %Z directive in format for to_datetime (pandas-dev#19979)
1 parent 36c1f6b commit 7b1f9bf

File tree

4 files changed

+183
-51
lines changed

4 files changed

+183
-51
lines changed

doc/source/whatsnew/v0.24.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ New features
1212

1313
Other Enhancements
1414
^^^^^^^^^^^^^^^^^^
15-
-
15+
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directives when passed into ``format`` (:issue:`13486`)
1616
-
1717
-
1818

pandas/_libs/tslibs/strptime.pyx

+86-47
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ except:
2020
except:
2121
from _dummy_thread import allocate_lock as _thread_allocate_lock
2222

23+
import pytz
2324

2425
from cython cimport Py_ssize_t
2526
from cpython cimport PyFloat_Check
@@ -40,6 +41,27 @@ from util cimport is_string_object
4041
from nattype cimport checknull_with_nat, NPY_NAT
4142
from nattype import nat_strings
4243

44+
cdef dict _parse_code_table = {
    # Directive letter -> integer code compared against ``parse_code``
    # inside array_strptime.
    'y': 0, 'Y': 1, 'm': 2, 'B': 3, 'b': 4,
    'd': 5, 'H': 6, 'I': 7, 'M': 8, 'S': 9,
    'f': 10, 'A': 11, 'a': 12, 'w': 13, 'j': 14,
    'U': 15, 'W': 16, 'Z': 17,
    'p': 18,  # an additional key, only with I
    'z': 19,
}
64+
4365

4466
def array_strptime(ndarray[object] values, object fmt,
4567
bint exact=True, errors='raise'):
@@ -58,15 +80,15 @@ def array_strptime(ndarray[object] values, object fmt,
5880
Py_ssize_t i, n = len(values)
5981
pandas_datetimestruct dts
6082
ndarray[int64_t] iresult
61-
int year, month, day, minute, hour, second, weekday, julian, tz
62-
int week_of_year, week_of_year_start
83+
ndarray[object] result_timezone
84+
int year, month, day, minute, hour, second, weekday, julian
85+
int week_of_year, week_of_year_start, parse_code, ordinal
6386
int64_t us, ns
64-
object val, group_key, ampm, found
87+
object val, group_key, ampm, found, timezone
6588
dict found_key
6689
bint is_raise = errors=='raise'
6790
bint is_ignore = errors=='ignore'
6891
bint is_coerce = errors=='coerce'
69-
int ordinal
7092

7193
assert is_raise or is_ignore or is_coerce
7294

@@ -79,6 +101,8 @@ def array_strptime(ndarray[object] values, object fmt,
79101
in fmt):
80102
raise ValueError("Cannot use '%W' or '%U' without "
81103
"day and year")
104+
elif '%Z' in fmt and '%z' in fmt:
105+
raise ValueError("Cannot parse both %Z and %z")
82106

83107
global _TimeRE_cache, _regex_cache
84108
with _cache_lock:
@@ -108,32 +132,10 @@ def array_strptime(ndarray[object] values, object fmt,
108132

109133
result = np.empty(n, dtype='M8[ns]')
110134
iresult = result.view('i8')
135+
result_timezone = np.empty(n, dtype='object')
111136

112137
dts.us = dts.ps = dts.as = 0
113138

114-
cdef dict _parse_code_table = {
115-
'y': 0,
116-
'Y': 1,
117-
'm': 2,
118-
'B': 3,
119-
'b': 4,
120-
'd': 5,
121-
'H': 6,
122-
'I': 7,
123-
'M': 8,
124-
'S': 9,
125-
'f': 10,
126-
'A': 11,
127-
'a': 12,
128-
'w': 13,
129-
'j': 14,
130-
'U': 15,
131-
'W': 16,
132-
'Z': 17,
133-
'p': 18 # just an additional key, works only with I
134-
}
135-
cdef int parse_code
136-
137139
for i in range(n):
138140
val = values[i]
139141
if is_string_object(val):
@@ -176,7 +178,7 @@ def array_strptime(ndarray[object] values, object fmt,
176178
year = 1900
177179
month = day = 1
178180
hour = minute = second = ns = us = 0
179-
tz = -1
181+
timezone = None
180182
# Default to -1 to signify that values not known; not critical to have,
181183
# though
182184
week_of_year = -1
@@ -266,21 +268,10 @@ def array_strptime(ndarray[object] values, object fmt,
266268
# W starts week on Monday.
267269
week_of_year_start = 0
268270
elif parse_code == 17:
269-
# Since -1 is default value only need to worry about setting tz
270-
# if it can be something other than -1.
271-
found_zone = found_dict['Z'].lower()
272-
for value, tz_values in enumerate(locale_time.timezone):
273-
if found_zone in tz_values:
274-
# Deal w/ bad locale setup where timezone names are the
275-
# same and yet time.daylight is true; too ambiguous to
276-
# be able to tell what timezone has daylight savings
277-
if (time.tzname[0] == time.tzname[1] and
278-
time.daylight and found_zone not in (
279-
"utc", "gmt")):
280-
break
281-
else:
282-
tz = value
283-
break
271+
timezone = pytz.timezone(found_dict['Z'])
272+
elif parse_code == 19:
273+
timezone = parse_timezone_directive(found_dict['z'])
274+
284275
# If we know the wk of the year and what day of that wk, we can figure
285276
# out the Julian day of the year.
286277
if julian == -1 and week_of_year != -1 and weekday != -1:
@@ -330,7 +321,9 @@ def array_strptime(ndarray[object] values, object fmt,
330321
continue
331322
raise
332323

333-
return result
324+
result_timezone[i] = timezone
325+
326+
return result, result_timezone
334327

335328

336329
"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored
@@ -538,14 +531,13 @@ class TimeRE(dict):
538531
# XXX: Does 'Y' need to worry about having less or more than
539532
# 4 digits?
540533
'Y': r"(?P<Y>\d\d\d\d)",
534+
'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)",
541535
'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
542536
'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
543537
'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
544538
'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
545539
'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
546-
'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone
547-
for tz in tz_names],
548-
'Z'),
540+
'Z': self.__seqToRE(pytz.all_timezones, 'Z'),
549541
'%': '%'})
550542
base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
551543
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
@@ -632,3 +624,50 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year,
632624
else:
633625
days_to_week = week_0_length + (7 * (week_of_year - 1))
634626
return 1 + days_to_week + day_of_week
627+
628+
cdef parse_timezone_directive(object z):
    """
    Parse the '%z' directive and return a pytz.FixedOffset

    Parameters
    ----------
    z : string of the UTC offset
        Either the literal 'Z' or a signed offset spelled '+HHMM',
        '+HH:MM', '+HHMMSS', '+HH:MM:SS' or with a trailing fractional
        second part ('.f' up to '.ffffff').

    Returns
    -------
    pytz.FixedOffset

    Raises
    ------
    ValueError
        If colons are used between some, but not all, of the
        hour / minute / second fields.

    Notes
    -----
    This is essentially similar to the cpython implementation
    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479

    The offset handed to pytz.FixedOffset is an integer number of minutes,
    so the seconds and fractional-second parts are parsed but only whole
    minutes contribute to the result.
    """

    cdef:
        int hours, minutes, seconds, pad_number, microseconds
        int total_minutes
        object gmtoff_remainder, gmtoff_remainder_padding

    if z == 'Z':
        return pytz.FixedOffset(0)

    if z[3] == ':':
        # Colon-separated spelling: drop the colons so that the fixed-width
        # slicing below works for both spellings.
        z = z[:3] + z[4:]
        if len(z) > 5:
            if z[5] != ':':
                msg = "Inconsistent use of : in {0}"
                raise ValueError(msg.format(z))
            z = z[:5] + z[6:]
    elif len(z) > 5 and z[5] == ':':
        # '+HHMM:SS' mixes the compact and the colon-separated spelling;
        # report it the same way as the mirror case above instead of
        # failing later inside int() with an unrelated message.
        msg = "Inconsistent use of : in {0}"
        raise ValueError(msg.format(z))

    hours = int(z[1:3])
    minutes = int(z[3:5])
    seconds = int(z[5:7] or 0)

    # Pad to always return microseconds.
    # z[7] is the '.' separator, the fraction digits start at index 8.
    gmtoff_remainder = z[8:]
    pad_number = 6 - len(gmtoff_remainder)
    gmtoff_remainder_padding = "0" * pad_number
    microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)

    # Explicit floor division: the result must be a whole number of minutes
    # regardless of the Cython language level used to compile this module.
    total_minutes = ((hours * 60) + minutes + (seconds // 60) +
                     (microseconds // 60000000))
    total_minutes = -total_minutes if z.startswith("-") else total_minutes
    return pytz.FixedOffset(total_minutes)

pandas/core/tools/datetimes.py

+43-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from datetime import datetime, timedelta, time
2-
import numpy as np
32
from collections import MutableMapping
43

4+
import numpy as np
5+
56
from pandas._libs import tslib
67
from pandas._libs.tslibs.strptime import array_strptime
78
from pandas._libs.tslibs import parsing, conversion
@@ -27,6 +28,7 @@
2728
ABCDataFrame)
2829
from pandas.core.dtypes.missing import notna
2930
from pandas.core import algorithms
31+
from pandas.compat import zip
3032

3133

3234
def _guess_datetime_format_for_array(arr, **kwargs):
@@ -103,6 +105,41 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
103105
return result.values
104106

105107

108+
def _return_parsed_timezone_results(result, timezones, box, tz):
    """
    Combine the two outputs of array_strptime into timezone-aware results.

    Used when the format string contained a %z or %Z directive.

    Parameters
    ----------
    result : ndarray
        parsed dates, one entry per input value
    timezones : ndarray
        timezone objects, aligned element-wise with ``result``
    box : boolean
        True boxes result as an Index-like, False returns an ndarray
    tz : object
        must be None; any other value is rejected

    Returns
    -------
    tz_result : parsed dates localized to their timezone

        - Index-like if box=True
        - ndarray of Timestamps if box=False

    Raises
    ------
    ValueError
        If ``tz`` is not None.
    """
    if tz is not None:
        raise ValueError("Cannot pass a tz argument when "
                         "parsing strings with timezone "
                         "information.")

    # Localize each parsed value with the zone found in its own string.
    localized = []
    for stamp, zone in zip(result, timezones):
        localized.append(tslib.Timestamp(stamp).tz_localize(zone))
    tz_results = np.array(localized)

    if not box:
        return tz_results

    from pandas import Index
    return Index(tz_results)
141+
142+
106143
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
107144
utc=None, box=True, format=None, exact=True,
108145
unit=None, infer_datetime_format=False, origin='unix',
@@ -343,8 +380,11 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
343380
# fallback
344381
if result is None:
345382
try:
346-
result = array_strptime(arg, format, exact=exact,
347-
errors=errors)
383+
result, timezones = array_strptime(
384+
arg, format, exact=exact, errors=errors)
385+
if '%Z' in format or '%z' in format:
386+
return _return_parsed_timezone_results(
387+
result, timezones, box, tz)
348388
except tslib.OutOfBoundsDatetime:
349389
if errors == 'raise':
350390
raise

pandas/tests/indexes/datetimes/test_tools.py

+53
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,59 @@ def test_to_datetime_format_weeks(self, cache):
179179
for s, format, dt in data:
180180
assert to_datetime(s, format=format, cache=cache) == dt
181181

182+
@pytest.mark.parametrize("box,const,assert_equal", [
    (True, pd.Index, 'assert_index_equal'),
    (False, np.array, 'assert_numpy_array_equal')])
@pytest.mark.parametrize("fmt,dates,expected_dates", [
    # %Z, the same zone name on every row
    ('%Y-%m-%d %H:%M:%S %Z',
     ['2010-01-01 12:00:00 UTC',
      '2010-01-01 12:00:00 UTC'],
     [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
      pd.Timestamp('2010-01-01 12:00:00', tz='UTC')]),
    # %Z, a different zone name on each row
    ('%Y-%m-%d %H:%M:%S %Z',
     ['2010-01-01 12:00:00 UTC',
      '2010-01-01 12:00:00 GMT',
      '2010-01-01 12:00:00 US/Pacific'],
     [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
      pd.Timestamp('2010-01-01 12:00:00', tz='GMT'),
      pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]),
    # %z directly after the seconds
    ('%Y-%m-%d %H:%M:%S%z',
     ['2010-01-01 12:00:00+0100',
      '2010-01-01 12:00:00+0100'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60)),
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60))]),
    # %z separated from the seconds by a space
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 +0100',
      '2010-01-01 12:00:00 +0100'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60)),
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60))]),
    # %z, positive and negative offsets mixed
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(60)),
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(-60))]),
    # %z given as the literal 'Z'
    ('%Y-%m-%d %H:%M:%S %z',
     ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'],
     [pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(0)),  # pytz coerces to UTC
      pd.Timestamp('2010-01-01 12:00:00',
                   tzinfo=pytz.FixedOffset(0))])])
def test_to_datetime_parse_tzname_or_tzoffset(self, box, const,
                                              assert_equal, fmt,
                                              dates, expected_dates):
    # GH 13486
    expected = const(expected_dates)
    result = pd.to_datetime(dates, format=fmt, box=box)

    # ``assert_equal`` names the pandas.util.testing helper matching ``box``
    compare = getattr(tm, assert_equal)
    compare(result, expected)

    # Requesting utc=True together with a %z / %Z format must raise
    with pytest.raises(ValueError):
        pd.to_datetime(dates, format=fmt, box=box, utc=True)
227+
@pytest.mark.parametrize('offset', [
    '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', ''])
def test_to_datetime_parse_timezone_malformed(self, offset):
    # Each ``offset`` is a string that is expected to be rejected for %z
    malformed = ['2010-01-01 12:00:00 ' + offset]
    with pytest.raises(ValueError):
        pd.to_datetime(malformed, format='%Y-%m-%d %H:%M:%S %z')
234+
182235

183236
class TestToDatetime(object):
184237
def test_to_datetime_pydatetime(self):

0 commit comments

Comments
 (0)