pandas-dev · jreback · Sep 14, 2021 · Jun 17, 2021 · Jun 17, 2021 · Jun 19, 2021
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -932,6 +932,7 @@ Timezones
 ^^^^^^^^^
 - Bug in different ``tzinfo`` objects representing UTC not being treated as equivalent (:issue:`39216`)
 - Bug in ``dateutil.tz.gettz("UTC")`` not being recognized as equivalent to other UTC-representing tzinfos (:issue:`39276`)
+- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` failing to parse a zero UTC offset (``Z``) correctly (:issue:`41047`)
 
 Numeric
 ^^^^^^^

diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -822,15 +822,17 @@ def format_is_iso(f: str) -> bint:
     Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
     but must be consistent.  Leading 0s in dates and times are optional.
     """
-    iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format
+    iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format
     excluded_formats = ['%Y%m%d', '%Y%m', '%Y']
 
     for date_sep in [' ', '/', '\\', '-', '.', '']:
         for time_sep in [' ', 'T']:
-            if (iso_template(date_sep=date_sep,
-                             time_sep=time_sep
-                             ).startswith(f) and f not in excluded_formats):
-                return True
+            for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']:
+                if (iso_template(date_sep=date_sep,
+                                 time_sep=time_sep,
+                                 micro_or_tz=micro_or_tz,
+                                 ).startswith(f) and f not in excluded_formats):
+                    return True
     return False
 
 
@@ -884,6 +886,7 @@ def guess_datetime_format(
         (('second',), '%S', 2),
         (('microsecond',), '%f', 6),
         (('second', 'microsecond'), '%S.%f', 0),
+        (('tzinfo',), '%z', 0),
         (('tzinfo',), '%Z', 0),
     ]
 
@@ -904,6 +907,53 @@ def guess_datetime_format(
     #  that any user-provided function will not either.
     tokens = dt_str_split(dt_str)
 
+    # Normalize offset part of tokens.
+    # There are multiple formats for the timezone offset.
+    # To pass the comparison condition between the output of `strftime` and
+    # joined tokens, which is carried out at the final step of the function,
+    # the offset part of the tokens must match the '%z' format like '+0900'
+    # instead of '+09:00'.
+    if (parsed_datetime.tzinfo is not None):
+        if (len(tokens) > 0) and (tokens[-1] == 'Z'):
+            # the last 'Z' means zero offset
+            tokens[-1] = '+0000'
+        else:
+            # If the input string has a timezone offset like '+0900',
+            # the offset is separated into two tokens, ex. ['+', '0900'].
+            # This separation will prevent subsequent processing
+            # from correctly parsing the time zone format.
+            # So in addition to the format normalization, we rejoin them here.
+            if (
+                (len(tokens) > 3)
+                and tokens[-1].isdigit()
+                and (tokens[-2] == ':')
+                and tokens[-3].isdigit()
+                and (tokens[-4] in ('+', '-'))
+            ):
+                # ex. [..., '+', '9', ':', '5'] -> [..., '+0905']
+                offset_idx = -4
+                sign, hour_offset, _, min_offset = tokens[offset_idx:]
+                tokens[offset_idx] = (
+                    f'{sign}{int(hour_offset):02d}{int(min_offset):02d}'
+                )
+                tokens = tokens[:offset_idx + 1]
+            elif (
+                (len(tokens) > 1)
+                and tokens[-1].isdigit()
+                and (tokens[-2] in ('+', '-'))
+            ):
+                # ex. [..., '+', '0905'] -> [..., '+0905']
+                offset_idx = -2
+                sign, offset = tokens[offset_idx:]
+                if len(offset) <= 2:
+                    # '+09' -> '+0900'
+                    tokens[offset_idx] = f'{sign}{int(offset):02d}00'
+                else:
+                    tokens[offset_idx] = f'{sign}{int(offset):04d}'
+                tokens = tokens[:offset_idx + 1]
+
+            # else: any other pattern is left to be parsed as a timezone name.
+
     format_guess = [None] * len(tokens)
     found_attrs = set()
 

diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -1941,6 +1941,23 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset):
         )
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "ts,zero_tz,is_utc",
+        [
+            ("2019-02-02 08:07:13", "Z", True),
+            ("2019-02-02 08:07:13", "", False),
+            ("2019-02-02 08:07:13.012345", "Z", True),
+            ("2019-02-02 08:07:13.012345", "", False),
+        ],
+    )
+    def test_infer_datetime_format_zero_tz(self, ts, zero_tz, is_utc):
+        # GH 41047
+        s = Series([ts + zero_tz])
+        result = to_datetime(s, infer_datetime_format=True)
+        tz = pytz.utc if is_utc else None
+        expected = Series([Timestamp(ts, tz=tz)])
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("cache", [True, False])
     def test_to_datetime_iso8601_noleading_0s(self, cache):
         # GH 11871

diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
@@ -144,6 +144,28 @@ def test_parsers_month_freq(date_str, expected):
         ("30-12-2011", "%d-%m-%Y"),
         ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"),
         ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"),
+        ("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
+        ("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+090", None),
+        ("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09:000", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09:", None),
+        ("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"),
+        ("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+090", None),
+        ("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09:000", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09:", None),
         ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
     ],
 )
@@ -226,3 +248,31 @@ def test_parse_time_string_check_instance_type_raise_exception():
     result = parse_time_string("2019")
     expected = (datetime(2019, 1, 1), "year")
     assert result == expected
+
+
+@pytest.mark.parametrize(
+    "fmt,expected",
+    [
+        ("%Y %m %d %H:%M:%S", True),
+        ("%Y/%m/%d %H:%M:%S", True),
+        (r"%Y\%m\%d %H:%M:%S", True),
+        ("%Y-%m-%d %H:%M:%S", True),
+        ("%Y.%m.%d %H:%M:%S", True),
+        ("%Y%m%d %H:%M:%S", True),
+        ("%Y-%m-%dT%H:%M:%S", True),
+        ("%Y-%m-%dT%H:%M:%S%z", True),
+        ("%Y-%m-%dT%H:%M:%S%Z", True),
+        ("%Y-%m-%dT%H:%M:%S.%f", True),
+        ("%Y-%m-%dT%H:%M:%S.%f%z", True),
+        ("%Y-%m-%dT%H:%M:%S.%f%Z", True),
+        ("%Y%m%d", False),
+        ("%Y%m", False),
+        ("%Y", False),
+        ("%Y-%m-%d", True),
+        ("%Y-%m", True),
+    ],
+)
+def test_is_iso_format(fmt, expected):
+    # see gh-41047
+    result = parsing.format_is_iso(fmt)
+    assert result == expected