Skip to content

Commit 94641dc

Browse files
committed
ENH: Parse %z directive in format for to_datetime
return timedeltas as list; return timedeltas in a numpy array; some flake fixes; Extend logic of parsing timezones; address comment; misspelling; Add additional tests; address timezone localization
1 parent 31afaf8 commit 94641dc

File tree

3 files changed

+178
-7
lines changed

3 files changed

+178
-7
lines changed

pandas/_libs/tslibs/strptime.pyx

+48-3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ except:
2020
except:
2121
from _dummy_thread import allocate_lock as _thread_allocate_lock
2222

23+
import pytz
2324

2425
from cython cimport Py_ssize_t
2526
from cpython cimport PyFloat_Check
@@ -29,7 +30,7 @@ cimport cython
2930
import numpy as np
3031
from numpy cimport ndarray, int64_t
3132

32-
from datetime import date as datetime_date
33+
from datetime import date as datetime_date, timedelta as datetime_timedelta
3334
from cpython.datetime cimport datetime
3435

3536
from np_datetime cimport (check_dts_bounds,
@@ -58,6 +59,8 @@ def array_strptime(ndarray[object] values, object fmt,
5859
Py_ssize_t i, n = len(values)
5960
pandas_datetimestruct dts
6061
ndarray[int64_t] iresult
62+
ndarray[object] results_tzoffset
63+
ndarray[object] results_tzname
6164
int year, month, day, minute, hour, second, weekday, julian, tz
6265
int week_of_year, week_of_year_start
6366
int64_t us, ns
@@ -109,6 +112,9 @@ def array_strptime(ndarray[object] values, object fmt,
109112
result = np.empty(n, dtype='M8[ns]')
110113
iresult = result.view('i8')
111114

115+
results_tzname = np.empty(n, dtype='object')
116+
results_tzoffset = np.empty(n, dtype='object')
117+
112118
dts.us = dts.ps = dts.as = 0
113119

114120
cdef dict _parse_code_table = {
@@ -130,7 +136,8 @@ def array_strptime(ndarray[object] values, object fmt,
130136
'U': 15,
131137
'W': 16,
132138
'Z': 17,
133-
'p': 18 # just an additional key, works only with I
139+
'p': 18, # just an additional key, works only with I
140+
'z': 19,
134141
}
135142
cdef int parse_code
136143

@@ -177,6 +184,8 @@ def array_strptime(ndarray[object] values, object fmt,
177184
month = day = 1
178185
hour = minute = second = ns = us = 0
179186
tz = -1
187+
gmtoff = None
188+
gmtoff_fraction = 0
180189
# Default to -1 to signify that values not known; not critical to have,
181190
# though
182191
week_of_year = -1
@@ -281,6 +290,32 @@ def array_strptime(ndarray[object] values, object fmt,
281290
else:
282291
tz = value
283292
break
293+
elif parse_code == 19:
294+
z = found_dict['z']
295+
if z == 'Z':
296+
gmtoff = 0
297+
else:
298+
if z[3] == ':':
299+
z = z[:3] + z[4:]
300+
if len(z) > 5:
301+
if z[5] != ':':
302+
msg = "Inconsistent use of : in {0}"
303+
raise ValueError(msg.format(found_dict['z']))
304+
z = z[:5] + z[6:]
305+
hours = int(z[1:3])
306+
minutes = int(z[3:5])
307+
seconds = int(z[5:7] or 0)
308+
gmtoff = (hours * 60 * 60) + (minutes * 60) + seconds
309+
gmtoff_remainder = z[8:]
310+
# Pad to always return microseconds.
311+
pad_number = 6 - len(gmtoff_remainder)
312+
gmtoff_remainder_padding = "0" * pad_number
313+
gmtoff_fraction = int(gmtoff_remainder +
314+
gmtoff_remainder_padding)
315+
if z.startswith("-"):
316+
gmtoff = -gmtoff
317+
gmtoff_fraction = -gmtoff_fraction
318+
284319
# If we know the wk of the year and what day of that wk, we can figure
285320
# out the Julian day of the year.
286321
if julian == -1 and week_of_year != -1 and weekday != -1:
@@ -330,7 +365,16 @@ def array_strptime(ndarray[object] values, object fmt,
330365
continue
331366
raise
332367

333-
return result
368+
tzname = found_dict.get('Z')
369+
if tzname is not None:
370+
results_tzname[i] = tzname
371+
372+
if gmtoff is not None:
373+
tzdelta = datetime_timedelta(seconds=gmtoff,
374+
microseconds=gmtoff_fraction)
375+
results_tzoffset[i] = tzdelta
376+
377+
return result, results_tzname, results_tzoffset
334378

335379

336380
"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored
@@ -538,6 +582,7 @@ class TimeRE(dict):
538582
# XXX: Does 'Y' need to worry about having less or more than
539583
# 4 digits?
540584
'Y': r"(?P<Y>\d\d\d\d)",
585+
'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)",
541586
'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
542587
'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
543588
'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),

pandas/core/tools/datetimes.py

+72-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from datetime import datetime, timedelta, time
2-
import numpy as np
32
from collections import MutableMapping
43

4+
import numpy as np
5+
import pytz
6+
57
from pandas._libs import tslib
68
from pandas._libs.tslibs.strptime import array_strptime
79
from pandas._libs.tslibs import parsing, conversion
@@ -27,6 +29,7 @@
2729
ABCDataFrame)
2830
from pandas.core.dtypes.missing import notna
2931
from pandas.core import algorithms
32+
from pandas.compat import PY3, zip
3033

3134

3235
def _guess_datetime_format_for_array(arr, **kwargs):
@@ -343,8 +346,74 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
343346
# fallback
344347
if result is None:
345348
try:
346-
result = array_strptime(arg, format, exact=exact,
347-
errors=errors)
349+
parsing_tzname = '%Z' in format
350+
parsing_tzoffset = '%z' in format
351+
if tz is not None and (parsing_tzname or
352+
parsing_tzoffset):
353+
raise ValueError("Cannot pass a tz argument when "
354+
"parsing strings with timezone "
355+
"information.")
356+
result, tznames, tzoffsets = array_strptime(
357+
arg, format, exact=exact, errors=errors)
358+
if parsing_tzname and not parsing_tzoffset:
359+
if len(set(tznames)) == 1:
360+
tz = tznames[0]
361+
if box:
362+
result = DatetimeIndex(result,
363+
tz=tz,
364+
name=name)
365+
else:
366+
stamps = [tslib.Timestamp(res, tz=tz)
367+
for res in result]
368+
result = np.array(stamps, dtype=object)
369+
else:
370+
stamps = [tslib.Timestamp(res, tz=tz)
371+
for res, tz in zip(result, tznames)]
372+
result = np.array(stamps, dtype=object)
373+
return result
374+
elif parsing_tzoffset and not parsing_tzname:
375+
# Should we convert these to pytz.FixedOffsets
376+
# or datetime.timezones?
377+
if len(set(tzoffsets)) == 1:
378+
offset_mins = tzoffsets[0].total_seconds() / 60
379+
tzoffset = pytz.FixedOffset(offset_mins)
380+
if box:
381+
result = DatetimeIndex(result,
382+
tz=tzoffset,
383+
name=name)
384+
else:
385+
stamps = []
386+
for res, offset in zip(result, tzoffsets):
387+
ts = tslib.Timestamp(res)
388+
ts = ts.tz_localize(tzoffset)
389+
stamps.append(ts)
390+
result = np.array(stamps, dtype=object)
391+
else:
392+
stamps = []
393+
for res, offset in zip(result, tzoffsets):
394+
offset_mins = offset.total_seconds() / 60
395+
tzoffset = pytz.FixedOffset(offset_mins)
396+
ts = tslib.Timestamp(res)
397+
ts = ts.tz_localize(tzoffset)
398+
stamps.append(ts)
399+
result = np.array(stamps, dtype=object)
400+
return result
401+
elif parsing_tzoffset and parsing_tzname:
402+
if not PY3:
403+
raise ValueError("Parsing tzoffsets are not "
404+
"not supported in Python 3")
405+
from datetime import timezone
406+
stamps = []
407+
for res, offset, tzname in zip(result, tzoffsets,
408+
tznames):
409+
# Do we need to validate these timezones?
410+
# e.g. UTC / +0100
411+
tzinfo = timezone(offset, tzname)
412+
ts = tslib.Timestamp(res, tzinfo=tzinfo)
413+
stamps.append(ts)
414+
result = np.array(stamps, dtype=object)
415+
return result
416+
348417
except tslib.OutOfBoundsDatetime:
349418
if errors == 'raise':
350419
raise

pandas/tests/indexes/datetimes/test_tools.py

+58-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import dateutil
99
import numpy as np
1010
from dateutil.parser import parse
11-
from datetime import datetime, date, time
11+
from datetime import datetime, timedelta, date, time
1212
from distutils.version import LooseVersion
1313

1414
import pandas as pd
@@ -183,6 +183,63 @@ def test_to_datetime_format_weeks(self, cache):
183183
for s, format, dt in data:
184184
assert to_datetime(s, format=format, cache=cache) == dt
185185

186+
@pytest.mark.skipif(not PY3,
187+
reason="datetime.timezone not supported in PY2")
188+
def test_to_datetime_parse_timezone(self):
189+
from datetime import timezone
190+
# %Z parsing only
191+
fmt = '%Y-%m-%d %H:%M:%S %Z'
192+
dates = ['2010-01-01 12:00:00 UTC'] * 2
193+
result = pd.to_datetime(dates, format=fmt)
194+
expected_dates = [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2
195+
expected = pd.DatetimeIndex(expected_dates)
196+
tm.assert_index_equal(result, expected)
197+
198+
result = pd.to_datetime(dates, format=fmt, box=False)
199+
expected = np.array(expected_dates, dtype=object)
200+
tm.assert_numpy_array_equal(result, expected)
201+
202+
dates = ['2010-01-01 12:00:00 UTC', '2010-01-01 12:00:00 GMT']
203+
result = pd.to_datetime(dates, format=fmt)
204+
expected_dates = [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
205+
pd.Timestamp('2010-01-01 12:00:00', tz='GMT')]
206+
expected = np.array(expected_dates, dtype=object)
207+
tm.assert_numpy_array_equal(result, expected)
208+
209+
# %z parsing only
210+
dates = ['2010-01-01 12:00:00 +0100'] * 2
211+
fmt = '%Y-%m-%d %H:%M:%S %z'
212+
result = pd.to_datetime(dates, format=fmt)
213+
expected_dates = [pd.Timestamp('2010-01-01 12:00:00',
214+
tzinfo=pytz.FixedOffset(60))] * 2
215+
expected = pd.DatetimeIndex(expected_dates)
216+
tm.assert_index_equal(result, expected)
217+
218+
result = pd.to_datetime(dates, format=fmt, box=False)
219+
expected = np.array(expected_dates, dtype=object)
220+
tm.assert_numpy_array_equal(result, expected)
221+
222+
dates = ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100']
223+
result = pd.to_datetime(dates, format=fmt)
224+
expected_dates = [pd.Timestamp('2010-01-01 12:00:00',
225+
tzinfo=pytz.FixedOffset(60)),
226+
pd.Timestamp('2010-01-01 12:00:00',
227+
tzinfo=pytz.FixedOffset(-60))]
228+
expected = np.array(expected_dates, dtype=object)
229+
tm.assert_numpy_array_equal(result, expected)
230+
231+
# %z and %Z parsing
232+
dates = ['2010-01-01 12:00:00 UTC +0100'] * 2
233+
fmt = '%Y-%m-%d %H:%M:%S %Z %z'
234+
result = pd.to_datetime(dates, format=fmt)
235+
tzinfo = timezone(timedelta(minutes=60), 'UTC')
236+
expected_dates = [pd.Timestamp('2010-01-01 13:00:00', tzinfo=tzinfo)]
237+
expected = np.array(expected_dates * 2, dtype=object)
238+
tm.assert_numpy_array_equal(result, expected)
239+
240+
with pytest.raises(ValueError):
241+
pd.to_datetime(dates, format=fmt, utc=True)
242+
186243

187244
class TestToDatetime(object):
188245
def test_to_datetime_pydatetime(self):

0 commit comments

Comments
 (0)