From f1a4d5f781d6b9ab08bae8dcd30791e006968d8f Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 13 Nov 2022 10:41:56 -0800 Subject: [PATCH 1/6] BUG: rendering dt64tz values with non-pytz --- doc/source/whatsnew/v2.0.0.rst | 2 ++ pandas/_libs/tslibs/vectorized.pyi | 1 - pandas/_libs/tslibs/vectorized.pyx | 4 ++-- pandas/tests/arrays/test_datetimes.py | 29 +++++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 032bcf09244e5..1f690902fcfdc 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -610,6 +610,8 @@ Datetimelike - Bug in :class:`DatetimeIndex` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``dtype`` or data (:issue:`48659`) - Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`) - Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`) +- Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`??`) +- Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index 22f457b9ddc0b..3fd9e2501e611 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -33,7 +33,6 @@ def get_resolution( def ints_to_pydatetime( arr: npt.NDArray[np.int64], tz: tzinfo | None = ..., - fold: bool = ..., box: str = ..., reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.object_]: ... 
diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 8661ba4b9b2f1..b95cebd60a847 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -94,7 +94,6 @@ def tz_convert_from_utc(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso=NPY_FR_ def ints_to_pydatetime( ndarray stamps, tzinfo tz=None, - bint fold=False, str box="datetime", NPY_DATETIMEUNIT reso=NPY_FR_ns, ) -> np.ndarray: @@ -136,6 +135,7 @@ def ints_to_pydatetime( tzinfo new_tz bint use_date = False, use_ts = False, use_pydt = False object res_val + bint fold = 0 # Note that `result` (and thus `result_flat`) is C-order and # `it` iterates C-order as well, so the iteration matches @@ -168,7 +168,7 @@ def ints_to_pydatetime( else: - local_val = info.utc_val_to_local_val(utc_val, &pos) + local_val = info.utc_val_to_local_val(utc_val, &pos, &fold) if info.use_pytz: # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos]] diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 166362a9a8c30..b13791715c33e 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -3,6 +3,7 @@ """ from datetime import timedelta import operator +from zoneinfo import ZoneInfo import numpy as np import pytest @@ -706,3 +707,31 @@ def test_tz_localize_t2d(self): roundtrip = expected.tz_localize("US/Pacific") tm.assert_datetime_array_equal(roundtrip, dta) + + @pytest.mark.parametrize( + "tz", [ZoneInfo("US/Eastern"), "US/Eastern", "dateutil/US/Eastern"] + ) + def test_iter_zoneinfo_fold(self, tz): + utc_vals = np.array( + [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64 + ) + utc_vals *= 1_000_000_000 + + dta = DatetimeArray(utc_vals).tz_localize("UTC").tz_convert(tz) + + left = dta[2] + right = list(dta)[2] + assert str(left) == str(right) + # previously there was a bug where with non-pytz right would be + # 
Timestamp('2011-11-06 01:00:00-0400', tz='US/Eastern') + # while left would be + # Timestamp('2011-11-06 01:00:00-0500', tz='US/Eastern') + # The .value's would match (so they would compare as equal), + # but the folds would not + assert left.utcoffset() == right.utcoffset() + + # The same bug in ints_to_pydatetime affected .astype, so we test + # that here. + right2 = dta.astype(object)[2] + assert str(left) == str(right2) + assert left.utcoffset() == right2.utcoffset() From dff06c2833c2e3df1b3c9310b9ad85788df435c1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 13 Nov 2022 10:46:38 -0800 Subject: [PATCH 2/6] GH ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/arrays/test_datetimes.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1f690902fcfdc..51ec4361a171e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -610,7 +610,7 @@ Datetimelike - Bug in :class:`DatetimeIndex` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``dtype`` or data (:issue:`48659`) - Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`) - Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`) -- Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`??`) +- Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`49684`) - Timedelta diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 
b13791715c33e..fde7e329c5a72 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -712,6 +712,7 @@ def test_tz_localize_t2d(self): "tz", [ZoneInfo("US/Eastern"), "US/Eastern", "dateutil/US/Eastern"] ) def test_iter_zoneinfo_fold(self, tz): + # GH#49684 utc_vals = np.array( [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64 ) From 211b167d445567e7851f86a5128fa1432abfaccb Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 13 Nov 2022 11:25:26 -0800 Subject: [PATCH 3/6] py38 compat --- pandas/tests/arrays/test_datetimes.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index fde7e329c5a72..70656639237e9 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -3,7 +3,11 @@ """ from datetime import timedelta import operator -from zoneinfo import ZoneInfo + +try: + from zoneinfo import ZoneInfo +except ImportError: + ZoneInfo = None import numpy as np import pytest @@ -708,9 +712,11 @@ def test_tz_localize_t2d(self): roundtrip = expected.tz_localize("US/Pacific") tm.assert_datetime_array_equal(roundtrip, dta) - @pytest.mark.parametrize( - "tz", [ZoneInfo("US/Eastern"), "US/Eastern", "dateutil/US/Eastern"] - ) + easts = ["US/Eastern", "dateutil/US/Eastern"] + if ZoneInfo is not None: + easts.append(ZoneInfo("US/Eastern")) + + @pytest.mark.parametrize("tz", easts) def test_iter_zoneinfo_fold(self, tz): # GH#49684 utc_vals = np.array( From 5cdbb33bc9015c07d8b9901364f3ce0aad2f5c34 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Nov 2022 11:13:56 -0800 Subject: [PATCH 4/6] BUG: support ambiguous=infer with ZoneInfo --- pandas/_libs/tslibs/timezones.pxd | 1 + pandas/_libs/tslibs/tzconversion.pyx | 130 ++++++++++++++---- .../tests/indexes/datetimes/test_timezones.py | 11 +- .../tests/scalar/timestamp/test_timezones.py | 18 +++ 4 files changed, 130 insertions(+), 30 
deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index c1a4e2bd5e1ac..31de2c5eeb8d5 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -6,6 +6,7 @@ from cpython.datetime cimport ( cdef tzinfo utc_pytz +cdef tzinfo utc_stdlib cpdef bint is_utc(tzinfo tz) cdef bint is_tzlocal(tzinfo tz) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 28259c9db26e5..16bf8a1292a61 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -36,6 +36,7 @@ from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, npy_datetimestruct, pandas_datetime_to_datetimestruct, + pydatetime_to_dt64, ) from pandas._libs.tslibs.timezones cimport ( get_dst_info, @@ -43,6 +44,7 @@ from pandas._libs.tslibs.timezones cimport ( is_tzlocal, is_utc, is_zoneinfo, + utc_stdlib, ) @@ -154,7 +156,7 @@ cdef int64_t tz_localize_to_utc_single( # TODO: test with non-nano return val - elif is_tzlocal(tz) or is_zoneinfo(tz): + elif is_tzlocal(tz): return val - _tz_localize_using_tzinfo_api(val, tz, to_utc=True, creso=creso) elif is_fixed_offset(tz): @@ -242,29 +244,6 @@ timedelta-like} if info.use_utc: return vals.copy() - result = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0) - - if info.use_tzlocal: - for i in range(n): - v = vals[i] - if v == NPY_NAT: - result[i] = NPY_NAT - else: - result[i] = v - _tz_localize_using_tzinfo_api( - v, tz, to_utc=True, creso=creso - ) - return result.base # to return underlying ndarray - - elif info.use_fixed: - delta = info.delta - for i in range(n): - v = vals[i] - if v == NPY_NAT: - result[i] = NPY_NAT - else: - result[i] = v - delta - return result.base # to return underlying ndarray - # silence false-positive compiler warning ambiguous_array = np.empty(0, dtype=bool) if isinstance(ambiguous, str): @@ -299,11 +278,39 @@ timedelta-like} "shift_backwards} or a timedelta object") raise 
ValueError(msg) + result = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0) + + if info.use_tzlocal and not is_zoneinfo(tz): + for i in range(n): + v = vals[i] + if v == NPY_NAT: + result[i] = NPY_NAT + else: + result[i] = v - _tz_localize_using_tzinfo_api( + v, tz, to_utc=True, creso=creso + ) + return result.base # to return underlying ndarray + + elif info.use_fixed: + delta = info.delta + for i in range(n): + v = vals[i] + if v == NPY_NAT: + result[i] = NPY_NAT + else: + result[i] = v - delta + return result.base # to return underlying ndarray + # Determine whether each date lies left of the DST transition (store in # result_a) or right of the DST transition (store in result_b) - result_a, result_b =_get_utc_bounds( - vals, info.tdata, info.ntrans, info.deltas, creso=creso - ) + if is_zoneinfo(tz): + result_a, result_b =_get_utc_bounds_zoneinfo( + vals, tz, creso=creso + ) + else: + result_a, result_b =_get_utc_bounds( + vals, info.tdata, info.ntrans, info.deltas, creso=creso + ) # silence false-positive compiler warning dst_hours = np.empty(0, dtype=np.int64) @@ -391,8 +398,7 @@ timedelta-like} return result.base # .base to get underlying ndarray -cdef inline Py_ssize_t bisect_right_i8(int64_t *data, - int64_t val, Py_ssize_t n): +cdef inline Py_ssize_t bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): # Caller is responsible for checking n > 0 # This looks very similar to local_search_right in the ndarray.searchsorted # implementation. @@ -483,6 +489,72 @@ cdef _get_utc_bounds( return result_a, result_b +cdef _get_utc_bounds_zoneinfo(ndarray vals, tz, NPY_DATETIMEUNIT creso): + """ + For each point in 'vals', find the UTC times that it corresponds to + with fold=0 and with fold=1. In non-ambiguous cases, these will match. 
+ + Parameters + ---------- + vals : ndarray[int64_t] + tz : ZoneInfo + creso : NPY_DATETIMEUNIT + + Returns + ------- + ndarray[int64_t] + ndarray[int64_t] + """ + cdef: + Py_ssize_t i, n = vals.size + npy_datetimestruct dts + datetime dt, left, right, aware, as_utc + int64_t val, pps = periods_per_second(creso) + ndarray result_a, result_b + + result_a = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0) + result_b = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0) + + for i in range(n): + val = vals[i] + if val == NPY_NAT: + result_a[i] = NPY_NAT + result_b[i] = NPY_NAT + continue + + pandas_datetime_to_datetimestruct(val, creso, &dts) + # casting to pydatetime drops nanoseconds etc, which we will + # need to re-add later as 'extra' + extra = (dts.ps // 1000) * (pps // 1_000_000_000) + + dt = datetime_new(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, None) + + aware = dt.replace(tzinfo=tz) + as_utc = aware.astimezone(utc_stdlib) + rt = as_utc.astimezone(tz) + if aware != rt: + # AFAICT this means that 'aware' is non-existent + # TODO: better way to check this? 
+ # mail.python.org/archives/list/datetime-sig@python.org/ + # thread/57Y3IQAASJOKHX4D27W463XTZIS2NR3M/ + result_a[i] = NPY_NAT + else: + left = as_utc.replace(tzinfo=None) + result_a[i] = pydatetime_to_dt64(left, &dts, creso) + extra + + aware = dt.replace(fold=1, tzinfo=tz) + as_utc = aware.astimezone(utc_stdlib) + rt = as_utc.astimezone(tz) + if aware != rt: + result_b[i] = NPY_NAT + else: + right = as_utc.replace(tzinfo=None) + result_b[i] = pydatetime_to_dt64(right, &dts, creso) + extra + + return result_a, result_b + + @cython.boundscheck(False) cdef ndarray[int64_t] _get_dst_hours( # vals, creso only needed here to potential render an exception message diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 0bc2862e55021..6d62e5298a163 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -18,6 +18,11 @@ import pytest import pytz +try: + from zoneinfo import ZoneInfo +except ImportError: + ZoneInfo = None + from pandas._libs.tslibs import ( conversion, timezones, @@ -355,7 +360,11 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): expected = dti.tz_convert("US/Eastern") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] + if ZoneInfo is not None: + easts.append(ZoneInfo("US/Eastern")) + + @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 912b7d9232abe..3e02ab208c502 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -29,6 +29,11 @@ Timestamp, ) +try: + from zoneinfo 
import ZoneInfo +except ImportError: + ZoneInfo = None + class TestTimestampTZOperations: # -------------------------------------------------------------- @@ -70,6 +75,19 @@ def test_tz_localize_ambiguous_bool(self, unit): with pytest.raises(pytz.AmbiguousTimeError, match=msg): ts.tz_localize("US/Central") + with pytest.raises(pytz.AmbiguousTimeError, match=msg): + ts.tz_localize("dateutil/US/Central") + + if ZoneInfo is not None: + try: + tz = ZoneInfo("US/Central") + except KeyError: + # no tzdata + pass + else: + with pytest.raises(pytz.AmbiguousTimeError, match=msg): + ts.tz_localize(tz) + result = ts.tz_localize("US/Central", ambiguous=True) assert result == expected0 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value From be568e2440059f67405037188b12421d4204b535 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Nov 2022 11:16:26 -0800 Subject: [PATCH 5/6] add type declaration --- pandas/_libs/tslibs/tzconversion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 16bf8a1292a61..99855b36e8676 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -508,7 +508,7 @@ cdef _get_utc_bounds_zoneinfo(ndarray vals, tz, NPY_DATETIMEUNIT creso): cdef: Py_ssize_t i, n = vals.size npy_datetimestruct dts - datetime dt, left, right, aware, as_utc + datetime dt, rt, left, right, aware, as_utc int64_t val, pps = periods_per_second(creso) ndarray result_a, result_b From 6bb724e02b042bca576024840d017dc8bdb250c9 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Nov 2022 15:31:25 -0800 Subject: [PATCH 6/6] no-tzdata compat --- pandas/tests/arrays/test_datetimes.py | 8 +++++++- pandas/tests/indexes/datetimes/test_timezones.py | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 70656639237e9..cf894ec185db0 100644 
--- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -714,7 +714,13 @@ def test_tz_localize_t2d(self): easts = ["US/Eastern", "dateutil/US/Eastern"] if ZoneInfo is not None: - easts.append(ZoneInfo("US/Eastern")) + try: + tz = ZoneInfo("US/Eastern") + except KeyError: + # no tzdata + pass + else: + easts.append(tz) @pytest.mark.parametrize("tz", easts) def test_iter_zoneinfo_fold(self, tz): diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 6d62e5298a163..8d651efe336e8 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -362,7 +362,13 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] if ZoneInfo is not None: - easts.append(ZoneInfo("US/Eastern")) + try: + tz = ZoneInfo("US/Eastern") + except KeyError: + # no tzdata + pass + else: + easts.append(tz) @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer(self, tz):