Skip to content

Commit cedd122

Browse files
authored
DEPR: __setitem__ on dt64tz with mixed timezones (#49454)
* BUG: Series(mixed_tz_objs, dtype=dt64tz) * whatsnew * update pyi * DEPR: __setitem__ on dt64tz with mixed timezones
1 parent aebd229 commit cedd122

File tree

17 files changed

+210
-225
lines changed

17 files changed

+210
-225
lines changed

doc/source/whatsnew/v2.0.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -320,13 +320,15 @@ Removal of prior version deprecations/changes
320320
- Changed the behavior of the :class:`Series` constructor; it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
321321
- Changed behavior of :class:`Timestamp` constructor with a ``np.datetime64`` object and a ``tz`` passed to interpret the input as a wall-time as opposed to a UTC time (:issue:`42288`)
322322
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
323+
- Changed behavior of setitem-like operations (``__setitem__``, ``fillna``, ``where``, ``mask``, ``replace``, ``insert``, ``fill_value`` for ``shift``) on an object with :class:`DatetimeTZDtype` when using a value with a non-matching timezone; the value will now be cast to the object's timezone instead of casting both to object-dtype (:issue:`44243`)
323324
- Changed behavior of :class:`Index`, :class:`Series`, :class:`DataFrame` constructors with floating-dtype data and a :class:`DatetimeTZDtype`, the data are now interpreted as UTC-times instead of wall-times, consistent with how integer-dtype data are treated (:issue:`45573`)
324325
- Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`)
325326
- Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. (:issue:`36695`, :issue:`24804`)
326327
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included. To include them, manually cast to ``bool`` dtype first (:issue:`46188`)
327328
- Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
328329
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :meth:`Series.transform` and :meth:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
329330
- Change behavior of :meth:`DataFrame.apply` with list-like so that any partial failure will raise an error (:issue:`43740`)
331+
-
330332

331333
.. ---------------------------------------------------------------------------
332334
.. _whatsnew_200.performance:
@@ -387,7 +389,7 @@ Timedelta
387389

388390
Timezones
389391
^^^^^^^^^
390-
-
392+
- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` incorrectly raising when casting object-dtype data containing timezone-aware ``datetime`` objects with heterogeneous timezones to a :class:`DatetimeTZDtype` (:issue:`32581`)
391393
-
392394

393395
Numeric

pandas/_libs/tslib.pyi

+4
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,7 @@ def array_to_datetime(
2828
) -> tuple[np.ndarray, tzinfo | None]: ...
2929

3030
# returned ndarray may be object dtype or datetime64[ns]
31+
32+
def array_to_datetime_with_tz(
33+
values: npt.NDArray[np.object_], tz: tzinfo
34+
) -> npt.NDArray[np.int64]: ...

pandas/_libs/tslib.pyx

+48
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from cpython.datetime cimport (
66
import_datetime,
77
tzinfo,
88
)
9+
from cpython.object cimport PyObject
910

1011
# import datetime C API
1112
import_datetime()
@@ -862,3 +863,50 @@ cdef inline bint _parse_today_now(str val, int64_t* iresult, bint utc):
862863
iresult[0] = Timestamp.today().value
863864
return True
864865
return False
866+
867+
868+
def array_to_datetime_with_tz(ndarray values, tzinfo tz):
869+
"""
870+
Vectorized analogue to pd.Timestamp(value, tz=tz)
871+
872+
values has object-dtype, unrestricted ndim.
873+
874+
Major differences between this and array_to_datetime with utc=True
875+
- np.datetime64 objects are treated as _wall_ times.
876+
- tznaive datetimes are treated as _wall_ times.
877+
"""
878+
cdef:
879+
ndarray result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_INT64, 0)
880+
cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, values)
881+
Py_ssize_t i, n = values.size
882+
object item
883+
int64_t ival
884+
datetime ts
885+
886+
for i in range(n):
887+
# Analogous to `item = values[i]`
888+
item = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
889+
890+
if checknull_with_nat_and_na(item):
891+
# this catches pd.NA which would raise in the Timestamp constructor
892+
ival = NPY_NAT
893+
894+
else:
895+
ts = Timestamp(item)
896+
if ts is NaT:
897+
ival = NPY_NAT
898+
else:
899+
if ts.tz is not None:
900+
ts = ts.tz_convert(tz)
901+
else:
902+
# datetime64, tznaive pydatetime, int, float
903+
ts = ts.tz_localize(tz)
904+
ts = ts._as_unit("ns")
905+
ival = ts.value
906+
907+
# Analogous to: result[i] = ival
908+
(<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival
909+
910+
cnp.PyArray_MultiIter_NEXT(mi)
911+
912+
return result

pandas/core/arrays/datetimelike.py

+8-16
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def _scalar_from_string(self, value: str) -> DTScalarOrNaT:
217217
raise AbstractMethodError(self)
218218

219219
def _unbox_scalar(
220-
self, value: DTScalarOrNaT, setitem: bool = False
220+
self, value: DTScalarOrNaT
221221
) -> np.int64 | np.datetime64 | np.timedelta64:
222222
"""
223223
Unbox the integer value of a scalar `value`.
@@ -226,8 +226,6 @@ def _unbox_scalar(
226226
----------
227227
value : Period, Timestamp, Timedelta, or NaT
228228
Depending on subclass.
229-
setitem : bool, default False
230-
Whether to check compatibility with setitem strictness.
231229
232230
Returns
233231
-------
@@ -240,9 +238,7 @@ def _unbox_scalar(
240238
"""
241239
raise AbstractMethodError(self)
242240

243-
def _check_compatible_with(
244-
self, other: DTScalarOrNaT, setitem: bool = False
245-
) -> None:
241+
def _check_compatible_with(self, other: DTScalarOrNaT) -> None:
246242
"""
247243
Verify that `self` and `other` are compatible.
248244
@@ -255,9 +251,6 @@ def _check_compatible_with(
255251
Parameters
256252
----------
257253
other
258-
setitem : bool, default False
259-
For __setitem__ we may have stricter compatibility restrictions than
260-
for comparisons.
261254
262255
Raises
263256
------
@@ -663,7 +656,7 @@ def _validate_scalar(
663656
# this option exists to prevent a performance hit in
664657
# TimedeltaIndex.get_loc
665658
return value
666-
return self._unbox_scalar(value, setitem=setitem)
659+
return self._unbox_scalar(value)
667660

668661
def _validation_error_message(self, value, allow_listlike: bool = False) -> str:
669662
"""
@@ -757,19 +750,18 @@ def _validate_setitem_value(self, value):
757750
else:
758751
return self._validate_scalar(value, allow_listlike=True)
759752

760-
return self._unbox(value, setitem=True)
753+
return self._unbox(value)
761754

762-
def _unbox(
763-
self, other, setitem: bool = False
764-
) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarray:
755+
@final
756+
def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarray:
765757
"""
766758
Unbox either a scalar with _unbox_scalar or an instance of our own type.
767759
"""
768760
if lib.is_scalar(other):
769-
other = self._unbox_scalar(other, setitem=setitem)
761+
other = self._unbox_scalar(other)
770762
else:
771763
# same type as self
772-
self._check_compatible_with(other, setitem=setitem)
764+
self._check_compatible_with(other)
773765
other = other._ndarray
774766
return other
775767

pandas/core/arrays/datetimes.py

+8-20
Original file line numberDiff line numberDiff line change
@@ -484,36 +484,19 @@ def _generate_range( # type: ignore[override]
484484
# -----------------------------------------------------------------
485485
# DatetimeLike Interface
486486

487-
def _unbox_scalar(self, value, setitem: bool = False) -> np.datetime64:
487+
def _unbox_scalar(self, value) -> np.datetime64:
488488
if not isinstance(value, self._scalar_type) and value is not NaT:
489489
raise ValueError("'value' should be a Timestamp.")
490-
self._check_compatible_with(value, setitem=setitem)
490+
self._check_compatible_with(value)
491491
return value.asm8
492492

493493
def _scalar_from_string(self, value) -> Timestamp | NaTType:
494494
return Timestamp(value, tz=self.tz)
495495

496-
def _check_compatible_with(self, other, setitem: bool = False):
496+
def _check_compatible_with(self, other) -> None:
497497
if other is NaT:
498498
return
499499
self._assert_tzawareness_compat(other)
500-
if setitem:
501-
# Stricter check for setitem vs comparison methods
502-
if self.tz is not None and not timezones.tz_compare(self.tz, other.tz):
503-
# TODO(2.0): remove this check. GH#37605
504-
warnings.warn(
505-
"Setitem-like behavior with mismatched timezones is deprecated "
506-
"and will change in a future version. Instead of raising "
507-
"(or for Index, Series, and DataFrame methods, coercing to "
508-
"object dtype), the value being set (or passed as a "
509-
"fill_value, or inserted) will be cast to the existing "
510-
"DatetimeArray/DatetimeIndex/Series/DataFrame column's "
511-
"timezone. To retain the old behavior, explicitly cast to "
512-
"object dtype before the operation.",
513-
FutureWarning,
514-
stacklevel=find_stack_level(),
515-
)
516-
raise ValueError(f"Timezones don't match. '{self.tz}' != '{other.tz}'")
517500

518501
# -----------------------------------------------------------------
519502
# Descriptive Properties
@@ -2030,6 +2013,11 @@ def _sequence_to_dt64ns(
20302013
copy = False
20312014
if lib.infer_dtype(data, skipna=False) == "integer":
20322015
data = data.astype(np.int64)
2016+
elif tz is not None and ambiguous == "raise":
2017+
# TODO: yearfirst/dayfirst/etc?
2018+
obj_data = np.asarray(data, dtype=object)
2019+
i8data = tslib.array_to_datetime_with_tz(obj_data, tz)
2020+
return i8data.view(DT64NS_DTYPE), tz, None
20332021
else:
20342022
# data comes back here as either i8 to denote UTC timestamps
20352023
# or M8[ns] to denote wall times

pandas/core/arrays/period.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -328,21 +328,20 @@ def _generate_range(cls, start, end, periods, freq, fields):
328328
def _unbox_scalar( # type: ignore[override]
329329
self,
330330
value: Period | NaTType,
331-
setitem: bool = False,
332331
) -> np.int64:
333332
if value is NaT:
334333
# error: Item "Period" of "Union[Period, NaTType]" has no attribute "value"
335334
return np.int64(value.value) # type: ignore[union-attr]
336335
elif isinstance(value, self._scalar_type):
337-
self._check_compatible_with(value, setitem=setitem)
336+
self._check_compatible_with(value)
338337
return np.int64(value.ordinal)
339338
else:
340339
raise ValueError(f"'value' should be a Period. Got '{value}' instead.")
341340

342341
def _scalar_from_string(self, value: str) -> Period:
343342
return Period(value, freq=self.freq)
344343

345-
def _check_compatible_with(self, other, setitem: bool = False) -> None:
344+
def _check_compatible_with(self, other) -> None:
346345
if other is NaT:
347346
return
348347
self._require_matching_freq(other)

pandas/core/arrays/timedeltas.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -292,10 +292,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
292292
# ----------------------------------------------------------------
293293
# DatetimeLike Interface
294294

295-
def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64:
295+
def _unbox_scalar(self, value) -> np.timedelta64:
296296
if not isinstance(value, self._scalar_type) and value is not NaT:
297297
raise ValueError("'value' should be a Timedelta.")
298-
self._check_compatible_with(value, setitem=setitem)
298+
self._check_compatible_with(value)
299299
if value is NaT:
300300
return np.timedelta64(value.value, "ns")
301301
else:
@@ -304,7 +304,7 @@ def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64:
304304
def _scalar_from_string(self, value) -> Timedelta | NaTType:
305305
return Timedelta(value)
306306

307-
def _check_compatible_with(self, other, setitem: bool = False) -> None:
307+
def _check_compatible_with(self, other) -> None:
308308
# we don't have anything to validate.
309309
pass
310310

pandas/tests/arrays/test_datetimelike.py

+10-37
Original file line numberDiff line numberDiff line change
@@ -292,19 +292,7 @@ def test_searchsorted(self):
292292
assert result == 10
293293

294294
@pytest.mark.parametrize("box", [None, "index", "series"])
295-
def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage):
296-
if isinstance(arr1d, DatetimeArray):
297-
tz = arr1d.tz
298-
ts1, ts2 = arr1d[1:3]
299-
if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2):
300-
# If we have e.g. tzutc(), when we cast to string and parse
301-
# back we get pytz.UTC, and then consider them different timezones
302-
# so incorrectly raise.
303-
mark = pytest.mark.xfail(
304-
raises=TypeError, reason="timezone comparisons inconsistent"
305-
)
306-
request.node.add_marker(mark)
307-
295+
def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
308296
arr = arr1d
309297
if box is None:
310298
pass
@@ -461,19 +449,8 @@ def test_setitem_object_dtype(self, box, arr1d):
461449

462450
tm.assert_equal(arr1d, expected)
463451

464-
def test_setitem_strs(self, arr1d, request):
452+
def test_setitem_strs(self, arr1d):
465453
# Check that we parse strs in both scalar and listlike
466-
if isinstance(arr1d, DatetimeArray):
467-
tz = arr1d.tz
468-
ts1, ts2 = arr1d[-2:]
469-
if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2):
470-
# If we have e.g. tzutc(), when we cast to string and parse
471-
# back we get pytz.UTC, and then consider them different timezones
472-
# so incorrectly raise.
473-
mark = pytest.mark.xfail(
474-
raises=TypeError, reason="timezone comparisons inconsistent"
475-
)
476-
request.node.add_marker(mark)
477454

478455
# Setting list-like of strs
479456
expected = arr1d.copy()
@@ -852,18 +829,14 @@ def test_take_fill_valid(self, arr1d, fixed_now_ts):
852829
# GH#37356
853830
# Assuming here that arr1d fixture does not include Australia/Melbourne
854831
value = fixed_now_ts.tz_localize("Australia/Melbourne")
855-
msg = "Timezones don't match. .* != 'Australia/Melbourne'"
856-
with pytest.raises(ValueError, match=msg):
857-
# require tz match, not just tzawareness match
858-
with tm.assert_produces_warning(
859-
FutureWarning, match="mismatched timezone"
860-
):
861-
result = arr.take([-1, 1], allow_fill=True, fill_value=value)
862-
863-
# once deprecation is enforced
864-
# expected = arr.take([-1, 1], allow_fill=True,
865-
# fill_value=value.tz_convert(arr.dtype.tz))
866-
# tm.assert_equal(result, expected)
832+
result = arr.take([-1, 1], allow_fill=True, fill_value=value)
833+
834+
expected = arr.take(
835+
[-1, 1],
836+
allow_fill=True,
837+
fill_value=value.tz_convert(arr.dtype.tz),
838+
)
839+
tm.assert_equal(result, expected)
867840

868841
def test_concat_same_type_invalid(self, arr1d):
869842
# different timezones

pandas/tests/arrays/test_datetimes.py

+9-19
Original file line numberDiff line numberDiff line change
@@ -429,19 +429,16 @@ def test_setitem_str_impute_tz(self, tz_naive_fixture):
429429
tm.assert_equal(arr, expected)
430430

431431
def test_setitem_different_tz_raises(self):
432+
# pre-2.0 we required exact tz match, in 2.0 we require only
433+
# tzawareness-match
432434
data = np.array([1, 2, 3], dtype="M8[ns]")
433435
arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central"))
434436
with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"):
435437
arr[0] = pd.Timestamp("2000")
436438

437439
ts = pd.Timestamp("2000", tz="US/Eastern")
438-
with pytest.raises(ValueError, match="US/Central"):
439-
with tm.assert_produces_warning(
440-
FutureWarning, match="mismatched timezones"
441-
):
442-
arr[0] = ts
443-
# once deprecation is enforced
444-
# assert arr[0] == ts.tz_convert("US/Central")
440+
arr[0] = ts
441+
assert arr[0] == ts.tz_convert("US/Central")
445442

446443
def test_setitem_clears_freq(self):
447444
a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central"))
@@ -688,23 +685,16 @@ def test_shift_value_tzawareness_mismatch(self):
688685
dta.shift(1, fill_value=invalid)
689686

690687
def test_shift_requires_tzmatch(self):
691-
# since filling is setitem-like, we require a matching timezone,
692-
# not just matching tzawawreness
688+
# pre-2.0 we required exact tz match, in 2.0 we require just
689+
# matching tzawareness
693690
dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
694691
dta = dti._data
695692

696693
fill_value = pd.Timestamp("2020-10-18 18:44", tz="US/Pacific")
697694

698-
msg = "Timezones don't match. 'UTC' != 'US/Pacific'"
699-
with pytest.raises(ValueError, match=msg):
700-
with tm.assert_produces_warning(
701-
FutureWarning, match="mismatched timezones"
702-
):
703-
dta.shift(1, fill_value=fill_value)
704-
705-
# once deprecation is enforced
706-
# expected = dta.shift(1, fill_value=fill_value.tz_convert("UTC"))
707-
# tm.assert_equal(result, expected)
695+
result = dta.shift(1, fill_value=fill_value)
696+
expected = dta.shift(1, fill_value=fill_value.tz_convert("UTC"))
697+
tm.assert_equal(result, expected)
708698

709699
def test_tz_localize_t2d(self):
710700
dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")

0 commit comments

Comments
 (0)