pandas-dev · jreback · May 29, 2018 · Mar 18, 2018 · Feb 22, 2018 · Mar 14, 2018
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -16,6 +16,7 @@ New features
 ~~~~~~~~~~~~
 
 - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`)
+- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
 
 
 .. _whatsnew_0231.deprecations:

diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -20,6 +20,7 @@ except:
         except:
             from _dummy_thread import allocate_lock as _thread_allocate_lock
 
+import pytz
 
 from cython cimport Py_ssize_t
 from cpython cimport PyFloat_Check
@@ -58,15 +59,36 @@ def array_strptime(ndarray[object] values, object fmt,
         Py_ssize_t i, n = len(values)
         pandas_datetimestruct dts
         ndarray[int64_t] iresult
-        int year, month, day, minute, hour, second, weekday, julian, tz
-        int week_of_year, week_of_year_start
+        ndarray[object] result_timezone
+        int year, month, day, minute, hour, second, weekday, julian
+        int week_of_year, week_of_year_start, parse_code
         int64_t us, ns
-        object val, group_key, ampm, found
+        object val, group_key, ampm, found, timezone
         dict found_key
         bint is_raise = errors=='raise'
         bint is_ignore = errors=='ignore'
         bint is_coerce = errors=='coerce'
         int ordinal
+        dict _parse_code_table = {'y': 0,
+                                  'Y': 1,
+                                  'm': 2,
+                                  'B': 3,
+                                  'b': 4,
+                                  'd': 5,
+                                  'H': 6,
+                                  'I': 7,
+                                  'M': 8,
+                                  'S': 9,
+                                  'f': 10,
+                                  'A': 11,
+                                  'a': 12,
+                                  'w': 13,
+                                  'j': 14,
+                                  'U': 15,
+                                  'W': 16,
+                                  'Z': 17,
+                                  'p': 18,  # an additional key, only with I
+                                  'z': 19}
 
     assert is_raise or is_ignore or is_coerce
 
@@ -108,32 +130,10 @@ def array_strptime(ndarray[object] values, object fmt,
 
     result = np.empty(n, dtype='M8[ns]')
     iresult = result.view('i8')
+    result_timezone = np.empty(n, dtype='object')
 
     dts.us = dts.ps = dts.as = 0
 
-    cdef dict _parse_code_table = {
-        'y': 0,
-        'Y': 1,
-        'm': 2,
-        'B': 3,
-        'b': 4,
-        'd': 5,
-        'H': 6,
-        'I': 7,
-        'M': 8,
-        'S': 9,
-        'f': 10,
-        'A': 11,
-        'a': 12,
-        'w': 13,
-        'j': 14,
-        'U': 15,
-        'W': 16,
-        'Z': 17,
-        'p': 18   # just an additional key, works only with I
-    }
-    cdef int parse_code
-
     for i in range(n):
         val = values[i]
         if is_string_object(val):
@@ -176,7 +176,7 @@ def array_strptime(ndarray[object] values, object fmt,
         year = 1900
         month = day = 1
         hour = minute = second = ns = us = 0
-        tz = -1
+        timezone = None
         # Default to -1 to signify that values not known; not critical to have,
         # though
         week_of_year = -1
@@ -266,21 +266,10 @@ def array_strptime(ndarray[object] values, object fmt,
                     # W starts week on Monday.
                     week_of_year_start = 0
             elif parse_code == 17:
-                # Since -1 is default value only need to worry about setting tz
-                # if it can be something other than -1.
-                found_zone = found_dict['Z'].lower()
-                for value, tz_values in enumerate(locale_time.timezone):
-                    if found_zone in tz_values:
-                        # Deal w/ bad locale setup where timezone names are the
-                        # same and yet time.daylight is true; too ambiguous to
-                        # be able to tell what timezone has daylight savings
-                        if (time.tzname[0] == time.tzname[1] and
-                            time.daylight and found_zone not in (
-                                "utc", "gmt")):
-                            break
-                        else:
-                            tz = value
-                            break
+                timezone = pytz.timezone(found_dict['Z'])
+            elif parse_code == 19:
+                timezone = parse_timezone_directive(found_dict['z'])
+
         # If we know the wk of the year and what day of that wk, we can figure
         # out the Julian day of the year.
         if julian == -1 and week_of_year != -1 and weekday != -1:
@@ -330,7 +319,9 @@ def array_strptime(ndarray[object] values, object fmt,
                 continue
             raise
 
-    return result
+        result_timezone[i] = timezone
+
+    return result, result_timezone
 
 
 """_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored
@@ -538,14 +529,13 @@ class TimeRE(dict):
             # XXX: Does 'Y' need to worry about having less or more than
             #     4 digits?
             'Y': r"(?P<Y>\d\d\d\d)",
+            'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)",
             'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
             'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
             'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
             'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
             'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
-            'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone
-                                 for tz in tz_names],
-                                'Z'),
+            'Z': self.__seqToRE(pytz.all_timezones, 'Z'),
             '%': '%'})
         base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
         base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
@@ -632,3 +622,50 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year,
     else:
         days_to_week = week_0_length + (7 * (week_of_year - 1))
         return 1 + days_to_week + day_of_week
+
+cdef parse_timezone_directive(object z):
+    """
+    Parse the '%z' directive and return a pytz.FixedOffset
+
+    Parameters
+    ----------
+    z : string of the UTC offset
+
+    Returns
+    -------
+    pytz.FixedOffset
+
+    Notes
+    -----
+    This is essentially similar to the cpython implementation
+    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479
+    """
+
+    cdef:
+        int gmtoff_fraction, hours, minutes, seconds, pad_number, microseconds
+        int total_minutes
+        object gmtoff_remainder, gmtoff_remainder_padding
+
+    if z == 'Z':
+        return pytz.FixedOffset(0)
+    if z[3] == ':':
+        z = z[:3] + z[4:]
+        if len(z) > 5:
+            if z[5] != ':':
+                msg = "Inconsistent use of : in {0}"
+                raise ValueError(msg.format(z))
+            z = z[:5] + z[6:]
+    hours = int(z[1:3])
+    minutes = int(z[3:5])
+    seconds = int(z[5:7] or 0)
+
+    # Pad to always return microseconds.
+    gmtoff_remainder = z[8:]
+    pad_number = 6 - len(gmtoff_remainder)
+    gmtoff_remainder_padding = "0" * pad_number
+    microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)
+
+    total_minutes = ((hours * 60) + minutes + (seconds / 60) +
+                     (microseconds / 60000000))
+    total_minutes = -total_minutes if z.startswith("-") else total_minutes
+    return pytz.FixedOffset(total_minutes)
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -1,7 +1,8 @@
 from datetime import datetime, timedelta, time
-import numpy as np
 from collections import MutableMapping
 
+import numpy as np
+
 from pandas._libs import tslib
 from pandas._libs.tslibs.strptime import array_strptime
 from pandas._libs.tslibs import parsing, conversion
@@ -27,6 +28,7 @@
     ABCDataFrame)
 from pandas.core.dtypes.missing import notna
 from pandas.core import algorithms
+from pandas.compat import zip
 
 
 def _guess_datetime_format_for_array(arr, **kwargs):
@@ -103,6 +105,36 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
     return result.values
 
 
+def _return_parsed_timezone_results(result, timezones, box):
+    """
+    Return results from array_strptime if a %z or %Z directive was passed.
+
+    Parameters
+    ----------
+    result : ndarray
+        int64 date representations of the dates
+    timezones : ndarray
+        pytz timezone objects
+    box : boolean
+        True boxes result as an Index-like, False returns an ndarray
+
+    Returns
+    -------
+    tz_result : ndarray of parsed dates with timezone
+        Returns:
+
+        - Index-like if box=True
+        - ndarray of Timestamps if box=False
+
+    """
+    tz_results = np.array([tslib.Timestamp(res).tz_localize(tz) for res, tz
+                           in zip(result, timezones)])
+    if box:
+        from pandas import Index
+        return Index(tz_results)
+    return tz_results
+
+
 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                 utc=None, box=True, format=None, exact=True,
                 unit=None, infer_datetime_format=False, origin='unix',
@@ -343,8 +375,20 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
                 # fallback
                 if result is None:
                     try:
-                        result = array_strptime(arg, format, exact=exact,
-                                                errors=errors)
+                        parsing_tzname = '%Z' in format
+                        parsing_tzoffset = '%z' in format
+                        if parsing_tzoffset and parsing_tzname:
+                            raise ValueError("Cannot parse both %Z and %z")
+                        elif tz is not None and (parsing_tzname or
+                                                 parsing_tzoffset):
+                            raise ValueError("Cannot pass a tz argument when "
+                                             "parsing strings with timezone "
+                                             "information.")
+                        result, timezones = array_strptime(
+                            arg, format, exact=exact, errors=errors)
+                        if parsing_tzname or parsing_tzoffset:
+                            return _return_parsed_timezone_results(
+                                result, timezones, box)
                     except tslib.OutOfBoundsDatetime:
                         if errors == 'raise':
                             raise

diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
@@ -179,6 +179,55 @@ def test_to_datetime_format_weeks(self, cache):
         for s, format, dt in data:
             assert to_datetime(s, format=format, cache=cache) == dt
 
+    @pytest.mark.parametrize("box,const,assert_equal", [
+        [True, pd.Index, 'assert_index_equal'],
+        [False, np.array, 'assert_numpy_array_equal']])
+    @pytest.mark.parametrize("fmt,dates,expected_dates", [
+        ['%Y-%m-%d %H:%M:%S %Z',
+         ['2010-01-01 12:00:00 UTC'] * 2,
+         [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2],
+        ['%Y-%m-%d %H:%M:%S %Z',
+         ['2010-01-01 12:00:00 UTC',
+          '2010-01-01 12:00:00 GMT',
+          '2010-01-01 12:00:00 US/Pacific'],
+         [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
+          pd.Timestamp('2010-01-01 12:00:00', tz='GMT'),
+          pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]],
+        ['%Y-%m-%d %H:%M:%S %z',
+         ['2010-01-01 12:00:00 +0100'] * 2,
+         [pd.Timestamp('2010-01-01 12:00:00',
+                       tzinfo=pytz.FixedOffset(60))] * 2],
+        ['%Y-%m-%d %H:%M:%S %z',
+         ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'],
+         [pd.Timestamp('2010-01-01 12:00:00',
+                       tzinfo=pytz.FixedOffset(60)),
+          pd.Timestamp('2010-01-01 12:00:00',
+                       tzinfo=pytz.FixedOffset(-60))]],
+        ['%Y-%m-%d %H:%M:%S %z',
+         ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'],
+         [pd.Timestamp('2010-01-01 12:00:00',
+                       tzinfo=pytz.FixedOffset(0)),
+          pd.Timestamp('2010-01-01 12:00:00',
+                       tzinfo=pytz.FixedOffset(0))]]])
+    def test_to_datetime_parse_tzname_or_tzoffset(self, box, const,
+                                                  assert_equal, fmt,
+                                                  dates, expected_dates):
+        # GH 13486
+        result = pd.to_datetime(dates, format=fmt, box=box)
+        expected = const(expected_dates)
+        getattr(tm, assert_equal)(result, expected)
+
+        with pytest.raises(ValueError):
+            pd.to_datetime(dates, format=fmt, box=box, utc=True)
+
+    @pytest.mark.parametrize('offset', [
+        '+0', '-1foo', 'UTCbar', ':10', '+01:000:01'])
+    def test_to_datetime_parse_timezone_malformed(self, offset):
+        fmt = '%Y-%m-%d %H:%M:%S %z'
+        date = '2010-01-01 12:00:00 ' + offset
+        with pytest.raises(ValueError):
+            pd.to_datetime([date], format=fmt)
+
 
 class TestToDatetime(object):
     def test_to_datetime_pydatetime(self):