Skip to content

BUG: support ambiguous=infer with ZoneInfo #49700

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 23, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,8 @@ Datetimelike
- Bug in :class:`DatetimeIndex` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``dtype`` or data (:issue:`48659`)
- Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`)
- Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`)
- Bug in rendering :class:`DatetimeIndex`, :class:`Series`, and :class:`DataFrame` objects whose timezone-aware dtypes use ``dateutil`` or ``zoneinfo`` timezones near daylight-saving transitions (:issue:`49684`)
-

Timedelta
^^^^^^^^^
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/timezones.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from cpython.datetime cimport (


cdef tzinfo utc_pytz
cdef tzinfo utc_stdlib

cpdef bint is_utc(tzinfo tz)
cdef bint is_tzlocal(tzinfo tz)
Expand Down
130 changes: 101 additions & 29 deletions pandas/_libs/tslibs/tzconversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,15 @@ from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
npy_datetimestruct,
pandas_datetime_to_datetimestruct,
pydatetime_to_dt64,
)
from pandas._libs.tslibs.timezones cimport (
get_dst_info,
is_fixed_offset,
is_tzlocal,
is_utc,
is_zoneinfo,
utc_stdlib,
)


Expand Down Expand Up @@ -154,7 +156,7 @@ cdef int64_t tz_localize_to_utc_single(
# TODO: test with non-nano
return val

elif is_tzlocal(tz) or is_zoneinfo(tz):
elif is_tzlocal(tz):
return val - _tz_localize_using_tzinfo_api(val, tz, to_utc=True, creso=creso)

elif is_fixed_offset(tz):
Expand Down Expand Up @@ -242,29 +244,6 @@ timedelta-like}
if info.use_utc:
return vals.copy()

result = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)

if info.use_tzlocal:
for i in range(n):
v = vals[i]
if v == NPY_NAT:
result[i] = NPY_NAT
else:
result[i] = v - _tz_localize_using_tzinfo_api(
v, tz, to_utc=True, creso=creso
)
return result.base # to return underlying ndarray

elif info.use_fixed:
delta = info.delta
for i in range(n):
v = vals[i]
if v == NPY_NAT:
result[i] = NPY_NAT
else:
result[i] = v - delta
return result.base # to return underlying ndarray

# silence false-positive compiler warning
ambiguous_array = np.empty(0, dtype=bool)
if isinstance(ambiguous, str):
Expand Down Expand Up @@ -299,11 +278,39 @@ timedelta-like}
"shift_backwards} or a timedelta object")
raise ValueError(msg)

result = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)

if info.use_tzlocal and not is_zoneinfo(tz):
for i in range(n):
v = vals[i]
if v == NPY_NAT:
result[i] = NPY_NAT
else:
result[i] = v - _tz_localize_using_tzinfo_api(
v, tz, to_utc=True, creso=creso
)
return result.base # to return underlying ndarray

elif info.use_fixed:
delta = info.delta
for i in range(n):
v = vals[i]
if v == NPY_NAT:
result[i] = NPY_NAT
else:
result[i] = v - delta
return result.base # to return underlying ndarray

# Determine whether each date lies left of the DST transition (store in
# result_a) or right of the DST transition (store in result_b)
result_a, result_b =_get_utc_bounds(
vals, info.tdata, info.ntrans, info.deltas, creso=creso
)
if is_zoneinfo(tz):
result_a, result_b =_get_utc_bounds_zoneinfo(
vals, tz, creso=creso
)
else:
result_a, result_b =_get_utc_bounds(
vals, info.tdata, info.ntrans, info.deltas, creso=creso
)

# silence false-positive compiler warning
dst_hours = np.empty(0, dtype=np.int64)
Expand Down Expand Up @@ -391,8 +398,7 @@ timedelta-like}
return result.base # .base to get underlying ndarray


cdef inline Py_ssize_t bisect_right_i8(int64_t *data,
int64_t val, Py_ssize_t n):
cdef inline Py_ssize_t bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
# Caller is responsible for checking n > 0
# This looks very similar to local_search_right in the ndarray.searchsorted
# implementation.
Expand Down Expand Up @@ -483,6 +489,72 @@ cdef _get_utc_bounds(
return result_a, result_b


cdef _get_utc_bounds_zoneinfo(ndarray vals, tz, NPY_DATETIMEUNIT creso):
    """
    For each wall time in 'vals', find the UTC time it corresponds to when
    interpreted in 'tz' with fold=0 and with fold=1.

    In non-ambiguous cases the two results match. When a wall time does not
    survive a round trip through UTC for a given fold, NPY_NAT is stored in
    the corresponding output instead.

    Parameters
    ----------
    vals : ndarray[int64_t]
    tz : ZoneInfo
    creso : NPY_DATETIMEUNIT

    Returns
    -------
    ndarray[int64_t]
        UTC values obtained with fold=0.
    ndarray[int64_t]
        UTC values obtained with fold=1.
    """
    cdef:
        Py_ssize_t idx, count = vals.size
        npy_datetimestruct dts
        datetime naive, localized, utc_dt, roundtrip, utc_naive
        int64_t wall, pps = periods_per_second(creso)
        ndarray result_a, result_b

    result_a = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)
    result_b = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)

    for idx in range(count):
        wall = vals[idx]
        if wall == NPY_NAT:
            # Missing values stay missing in both outputs.
            result_a[idx] = NPY_NAT
            result_b[idx] = NPY_NAT
            continue

        pandas_datetime_to_datetimestruct(wall, creso, &dts)
        # Casting to pydatetime drops nanoseconds etc, so remember them here
        # as 'extra' and re-add them to each result. The second factor is
        # zero whenever there are fewer than 1e9 periods per second.
        # NOTE(review): 'dts' is handed by pointer to pydatetime_to_dt64
        # below and is presumably overwritten there, so 'extra' has to be
        # computed first.
        extra = (dts.ps // 1000) * (pps // 1_000_000_000)

        # datetime_new produces a naive datetime with fold=0.
        naive = datetime_new(dts.year, dts.month, dts.day, dts.hour,
                             dts.min, dts.sec, dts.us, None)

        # result_a receives the fold=0 interpretation, result_b the fold=1
        # interpretation; the fold=0 pass runs first.
        for fold, out in ((0, result_a), (1, result_b)):
            localized = naive.replace(fold=fold, tzinfo=tz)
            utc_dt = localized.astimezone(utc_stdlib)
            roundtrip = utc_dt.astimezone(tz)

            if localized != roundtrip:
                # AFAICT this means that 'localized' is non-existent
                # TODO: better way to check this?
                #  mail.python.org/archives/list/datetime-sig@python.org/
                #  thread/57Y3IQAASJOKHX4D27W463XTZIS2NR3M/
                out[idx] = NPY_NAT
            else:
                utc_naive = utc_dt.replace(tzinfo=None)
                out[idx] = pydatetime_to_dt64(utc_naive, &dts, creso) + extra

    return result_a, result_b


@cython.boundscheck(False)
cdef ndarray[int64_t] _get_dst_hours(
# vals, creso only needed here to potentially render an exception message
Expand Down
1 change: 0 additions & 1 deletion pandas/_libs/tslibs/vectorized.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def get_resolution(
def ints_to_pydatetime(
arr: npt.NDArray[np.int64],
tz: tzinfo | None = ...,
fold: bool = ...,
box: str = ...,
reso: int = ..., # NPY_DATETIMEUNIT
) -> npt.NDArray[np.object_]: ...
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/tslibs/vectorized.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ def tz_convert_from_utc(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso=NPY_FR_
def ints_to_pydatetime(
ndarray stamps,
tzinfo tz=None,
bint fold=False,
str box="datetime",
NPY_DATETIMEUNIT reso=NPY_FR_ns,
) -> np.ndarray:
Expand Down Expand Up @@ -136,6 +135,7 @@ def ints_to_pydatetime(
tzinfo new_tz
bint use_date = False, use_ts = False, use_pydt = False
object res_val
bint fold = 0

# Note that `result` (and thus `result_flat`) is C-order and
# `it` iterates C-order as well, so the iteration matches
Expand Down Expand Up @@ -168,7 +168,7 @@ def ints_to_pydatetime(

else:

local_val = info.utc_val_to_local_val(utc_val, &pos)
local_val = info.utc_val_to_local_val(utc_val, &pos, &fold)
if info.use_pytz:
# find right representation of dst etc in pytz timezone
new_tz = tz._tzinfos[tz._transition_info[pos]]
Expand Down
42 changes: 42 additions & 0 deletions pandas/tests/arrays/test_datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
from datetime import timedelta
import operator

try:
from zoneinfo import ZoneInfo
except ImportError:
ZoneInfo = None

import numpy as np
import pytest

Expand Down Expand Up @@ -706,3 +711,40 @@ def test_tz_localize_t2d(self):

roundtrip = expected.tz_localize("US/Pacific")
tm.assert_datetime_array_equal(roundtrip, dta)

# Timezones the fold test below is parametrized over: two string spellings of
# US/Eastern, plus a ZoneInfo instance whenever one can be constructed.
easts = ["US/Eastern", "dateutil/US/Eastern"]
if ZoneInfo is not None:
    try:
        tz = ZoneInfo("US/Eastern")
    except KeyError:
        # the key lookup failed, i.e. no tzdata is available
        pass
    else:
        easts.append(tz)

@pytest.mark.parametrize("tz", easts)
def test_iter_zoneinfo_fold(self, tz):
    """Indexing, iterating and ``astype(object)`` must agree on the fold."""
    # GH#49684
    epoch_seconds = [1320552000, 1320555600, 1320559200, 1320562800]
    utc_nanos = np.array(epoch_seconds, dtype=np.int64) * 1_000_000_000

    dta = DatetimeArray(utc_nanos).tz_localize("UTC").tz_convert(tz)

    # Reference value: the element as returned by scalar indexing.
    indexed = dta[2]

    def check_matches_indexed(other):
        # Compare the rendering first, then the UTC offset.
        assert str(indexed) == str(other)
        assert indexed.utcoffset() == other.utcoffset()

    # Before the fix, with non-pytz timezones the iterated element was
    #  Timestamp('2011-11-06 01:00:00-0400', tz='US/Eastern')
    # whereas the indexed element was
    #  Timestamp('2011-11-06 01:00:00-0500', tz='US/Eastern')
    # Their .value's matched (so the two compared as equal), but their
    # folds did not, hence the str and utcoffset comparisons.
    check_matches_indexed(list(dta)[2])

    # .astype(object) was affected by the same bug in ints_to_pydatetime,
    # so it is exercised here as well.
    check_matches_indexed(dta.astype(object)[2])
17 changes: 16 additions & 1 deletion pandas/tests/indexes/datetimes/test_timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
import pytest
import pytz

try:
from zoneinfo import ZoneInfo
except ImportError:
ZoneInfo = None

from pandas._libs.tslibs import (
conversion,
timezones,
Expand Down Expand Up @@ -355,7 +360,17 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self):
expected = dti.tz_convert("US/Eastern")
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")])
easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")]
if ZoneInfo is not None:
try:
tz = ZoneInfo("US/Eastern")
except KeyError:
# no tzdata
pass
else:
easts.append(tz)

@pytest.mark.parametrize("tz", easts)
def test_dti_tz_localize_ambiguous_infer(self, tz):
# November 6, 2011, fall back, repeat 2 AM hour
# With no repeated hours, we cannot infer the transition
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/scalar/timestamp/test_timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
Timestamp,
)

try:
from zoneinfo import ZoneInfo
except ImportError:
ZoneInfo = None


class TestTimestampTZOperations:
# --------------------------------------------------------------
Expand Down Expand Up @@ -70,6 +75,19 @@ def test_tz_localize_ambiguous_bool(self, unit):
with pytest.raises(pytz.AmbiguousTimeError, match=msg):
ts.tz_localize("US/Central")

with pytest.raises(pytz.AmbiguousTimeError, match=msg):
ts.tz_localize("dateutil/US/Central")

if ZoneInfo is not None:
try:
tz = ZoneInfo("US/Central")
except KeyError:
# no tzdata
pass
else:
with pytest.raises(pytz.AmbiguousTimeError, match=msg):
ts.tz_localize(tz)

result = ts.tz_localize("US/Central", ambiguous=True)
assert result == expected0
assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value
Expand Down