From 4c9b11647b92d59caad982791edc71a8075dcf59 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 29 Mar 2020 15:32:35 -0700 Subject: [PATCH 1/5] BUG: to_datetime with infer_datetime_format dropped timezone names --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/_libs/tslibs/parsing.pyx | 1 + pandas/tests/tools/test_to_datetime.py | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 58ac2b4cba3b7..a06ed50ef5999 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -312,7 +312,7 @@ Timedelta Timezones ^^^^^^^^^ -- +- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` where timezone names (e.g. ``UTC``) would not be parsed correctly (:issue:``) - diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 74b95a2f3076f..5272a0a042d0e 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -805,6 +805,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, (('second',), '%S', 2), (('microsecond',), '%f', 6), (('second', 'microsecond'), '%S.%f', 0), + (('tzinfo',), '%Z', 0), ] if dayfirst: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a751182dbf7af..4592d98674d26 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1862,6 +1862,12 @@ def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): pd.to_datetime(s, infer_datetime_format=True, cache=cache), ) + def test_infer_datetime_format_tz_name(self): + s = pd.Series(["2019-02-02 08:07:13 UTC"]) + result = to_datetime(s, infer_datetime_format=True) + expected = pd.Series([pd.Timestamp("2019-02-02 08:07:13").tz_localize("UTC")]) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 From c9cbeec6eef9e7d5b6d258a187f6edf534709b07 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 29 Mar 2020 15:33:44 -0700 Subject: [PATCH 2/5] Add issue number --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a06ed50ef5999..366eb60ec50c0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -312,7 +312,7 @@ Timedelta Timezones ^^^^^^^^^ -- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` where timezone names (e.g. ``UTC``) would not be parsed correctly (:issue:``) +- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` where timezone names (e.g. ``UTC``) would not be parsed correctly (:issue:`33133`) - From 48a4fbbb31e9a025fb0687e13c851092b35d0fee Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 29 Mar 2020 15:50:30 -0700 Subject: [PATCH 3/5] Clarify infer_datetime_format --- pandas/core/tools/datetimes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a2042de7f337e..207c5cc98449a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -606,9 +606,9 @@ def to_datetime( would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : bool, default False If True and no `format` is given, attempt to infer the format of the - datetime strings, and if it can be inferred, switch to a faster - method of parsing them. In some cases this can increase the parsing - speed by ~5-10x. + datetime strings based on the first non-NaN element, + and if it can be inferred, switch to a faster method of parsing them. + In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. From 42d0e6fec36ccefd345de39235926141d72dbc02 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 30 Mar 2020 16:26:20 -0700 Subject: [PATCH 4/5] Add parameterized test --- pandas/tests/tools/test_to_datetime.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4592d98674d26..f40ff1aa2668d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1862,10 +1862,15 @@ def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): pd.to_datetime(s, infer_datetime_format=True, cache=cache), ) - def test_infer_datetime_format_tz_name(self): - s = pd.Series(["2019-02-02 08:07:13 UTC"]) + @pytest.mark.parametrize( + "tz_name, offset", [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)] + ) + def test_infer_datetime_format_tz_name(self, tz_name, offset): + s = pd.Series([f"2019-02-02 08:07:13 {tz_name}"]) result = to_datetime(s, infer_datetime_format=True) - expected = pd.Series([pd.Timestamp("2019-02-02 08:07:13").tz_localize("UTC")]) + expected = pd.Series( + [pd.Timestamp("2019-02-02 08:07:13").tz_localize(pytz.FixedOffset(offset))] + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("cache", [True, False]) From 77a04340a21722f8951c377208f0f80a18277bcf Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 30 Mar 2020 18:26:17 -0700 Subject: [PATCH 5/5] Add GH ref --- pandas/tests/tools/test_to_datetime.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f40ff1aa2668d..d2049892705ea 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1866,6 +1866,7 @@ def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): "tz_name, offset", [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)] ) def test_infer_datetime_format_tz_name(self, tz_name, offset): + # GH 33133 s = pd.Series([f"2019-02-02 08:07:13 {tz_name}"]) result = to_datetime(s, infer_datetime_format=True) expected = pd.Series(