diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 4cbaa184791b8..769889dfbe75d 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -173,6 +173,7 @@ def setup(self): self.strings_tz_space = [ x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng ] + self.strings_zero_tz = [x.strftime("%Y-%m-%d %H:%M:%S") + "Z" for x in rng] def time_iso8601(self): to_datetime(self.strings) @@ -189,6 +190,10 @@ def time_iso8601_format_no_sep(self): def time_iso8601_tz_spaceformat(self): to_datetime(self.strings_tz_space) + def time_iso8601_infer_zero_tz_fromat(self): + # GH 41047 + to_datetime(self.strings_zero_tz, infer_datetime_format=True) + class ToDatetimeNONISO8601: def setup(self): diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8755ae851d474..8a78d63eab5c8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -334,6 +334,7 @@ Timedelta Timezones ^^^^^^^^^ +- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` failing to parse zero UTC offset (``Z``) correctly (:issue:`41047`) - Bug in :meth:`Series.dt.tz_convert` resetting index in a :class:`Series` with :class:`CategoricalIndex` (:issue:`43080`) - diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index cfa16df367bce..afbb63ecbd2d7 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -845,15 +845,17 @@ def format_is_iso(f: str) -> bint: Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different but must be consistent. Leading 0s in dates and times are optional. 
""" - iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format + iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] for date_sep in [' ', '/', '\\', '-', '.', '']: for time_sep in [' ', 'T']: - if (iso_template(date_sep=date_sep, - time_sep=time_sep - ).startswith(f) and f not in excluded_formats): - return True + for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']: + if (iso_template(date_sep=date_sep, + time_sep=time_sep, + micro_or_tz=micro_or_tz, + ).startswith(f) and f not in excluded_formats): + return True return False @@ -907,6 +909,7 @@ def guess_datetime_format( (('second',), '%S', 2), (('microsecond',), '%f', 6), (('second', 'microsecond'), '%S.%f', 0), + (('tzinfo',), '%z', 0), (('tzinfo',), '%Z', 0), ] @@ -927,6 +930,33 @@ def guess_datetime_format( # that any user-provided function will not either. tokens = dt_str_split(dt_str) + # Normalize offset part of tokens. + # There are multiple formats for the timezone offset. + # To pass the comparison condition between the output of `strftime` and + # joined tokens, which is carried out at the final step of the function, + # the offset part of the tokens must match the '%z' format like '+0900' + # instead of '+09:00'. + if parsed_datetime.tzinfo is not None: + offset_index = None + if len(tokens) > 0 and tokens[-1] == 'Z': + # the last 'Z' means zero offset + offset_index = -1 + elif len(tokens) > 1 and tokens[-2] in ('+', '-'): + # ex. [..., '+', '0900'] + offset_index = -2 + elif len(tokens) > 3 and tokens[-4] in ('+', '-'): + # ex. [..., '+', '09', ':', '00'] + offset_index = -4 + + if offset_index is not None: + # If the input string has a timezone offset like '+0900', + # the offset is separated into two tokens, ex. ['+', '0900']. + # This separation will prevent subsequent processing + # from correctly parsing the time zone format. + # So in addition to the format normalization, we rejoin them here. 
+ tokens[offset_index] = parsed_datetime.strftime("%z") + tokens = tokens[:offset_index + 1 or None] + format_guess = [None] * len(tokens) found_attrs = set() diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 469a5caf7d694..a38affbc7f723 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2032,6 +2032,23 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "ts,zero_tz,is_utc", + [ + ("2019-02-02 08:07:13", "Z", True), + ("2019-02-02 08:07:13", "", False), + ("2019-02-02 08:07:13.012345", "Z", True), + ("2019-02-02 08:07:13.012345", "", False), + ], + ) + def test_infer_datetime_format_zero_tz(self, ts, zero_tz, is_utc): + # GH 41047 + s = Series([ts + zero_tz]) + result = to_datetime(s, infer_datetime_format=True) + tz = pytz.utc if is_utc else None + expected = Series([Timestamp(ts, tz=tz)]) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index e580b9112f3ec..3992457c9c361 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -144,6 +144,28 @@ def test_parsers_month_freq(date_str, expected): ("30-12-2011", "%d-%m-%Y"), ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"), ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"), + ("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+090", None), + ("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+09:000", 
"%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+09:", None), + ("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"), + ("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+090", None), + ("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+09:000", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+09:", None), ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"), ], ) @@ -226,3 +248,31 @@ def test_parse_time_string_check_instance_type_raise_exception(): result = parse_time_string("2019") expected = (datetime(2019, 1, 1), "year") assert result == expected + + +@pytest.mark.parametrize( + "fmt,expected", + [ + ("%Y %m %d %H:%M:%S", True), + ("%Y/%m/%d %H:%M:%S", True), + (r"%Y\%m\%d %H:%M:%S", True), + ("%Y-%m-%d %H:%M:%S", True), + ("%Y.%m.%d %H:%M:%S", True), + ("%Y%m%d %H:%M:%S", True), + ("%Y-%m-%dT%H:%M:%S", True), + ("%Y-%m-%dT%H:%M:%S%z", True), + ("%Y-%m-%dT%H:%M:%S%Z", True), + ("%Y-%m-%dT%H:%M:%S.%f", True), + ("%Y-%m-%dT%H:%M:%S.%f%z", True), + ("%Y-%m-%dT%H:%M:%S.%f%Z", True), + ("%Y%m%d", False), + ("%Y%m", False), + ("%Y", False), + ("%Y-%m-%d", True), + ("%Y-%m", True), + ], +) +def test_is_iso_format(fmt, expected): + # see gh-41047 + result = parsing.format_is_iso(fmt) + assert result == expected