pandas-dev · mroeschke · Nov 3, 2022 · Oct 31, 2022 · Oct 31, 2022 · Oct 31, 2022
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -290,11 +290,11 @@ Removal of prior version deprecations/changes
 - Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`)
 - Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
 - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
+- Changed behavior of setitem-like operations (``__setitem__``, ``fillna``, ``where``, ``mask``, ``replace``, ``insert``, fill_value for ``shift``) on an object with :class:`DatetimeTZDtype` when using a value with a non-matching timezone, the value will be cast to the object's timezone instead of casting both to object-dtype (:issue:`44243`)
 - Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`)
 - Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`)
 - Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
 - Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.performance:
@@ -355,7 +355,7 @@ Timedelta
 
 Timezones
 ^^^^^^^^^
--
+- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` with object-dtype containing multiple timezone-aware ``datetime`` objects with heterogeneous timezones to a :class:`DatetimeTZDtype` incorrectly raising (:issue:`32581`)
 -
 
 Numeric

diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
@@ -28,3 +28,7 @@ def array_to_datetime(
 ) -> tuple[np.ndarray, tzinfo | None]: ...
 
 # returned ndarray may be object dtype or datetime64[ns]
+
+def array_to_datetime_with_tz(
+    values: npt.NDArray[np.object_], tz: tzinfo
+) -> npt.NDArray[np.int64]: ...
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -6,6 +6,7 @@ from cpython.datetime cimport (
     import_datetime,
     tzinfo,
 )
+from cpython.object cimport PyObject
 
 # import datetime C API
 import_datetime()
@@ -862,3 +863,50 @@ cdef inline bint _parse_today_now(str val, int64_t* iresult, bint utc):
         iresult[0] = Timestamp.today().value
         return True
     return False
+
+
+def array_to_datetime_with_tz(ndarray values, tzinfo tz):
+    """
+    Vectorized analogue to pd.Timestamp(value, tz=tz)
+
+    values has object-dtype, unrestricted ndim.
+
+    Major differences between this and array_to_datetime with utc=True
+        - np.datetime64 objects are treated as _wall_ times.
+        - tznaive datetimes are treated as _wall_ times.
+    """
+    cdef:
+        ndarray result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_INT64, 0)
+        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, values)
+        Py_ssize_t i, n = values.size
+        object item
+        int64_t ival
+        datetime ts
+
+    for i in range(n):
+        # Analogous to `item = values[i]`
+        item = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
+
+        if checknull_with_nat_and_na(item):
+            # this catches pd.NA which would raise in the Timestamp constructor
+            ival = NPY_NAT
+
+        else:
+            ts = Timestamp(item)
+            if ts is NaT:
+                ival = NPY_NAT
+            else:
+                if ts.tz is not None:
+                    ts = ts.tz_convert(tz)
+                else:
+                    # datetime64, tznaive pydatetime, int, float
+                    ts = ts.tz_localize(tz)
+                ts = ts._as_unit("ns")
+                ival = ts.value
+
+        # Analogous to: result[i] = ival
+        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival
+
+        cnp.PyArray_MultiIter_NEXT(mi)
+
+    return result
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -217,7 +217,7 @@ def _scalar_from_string(self, value: str) -> DTScalarOrNaT:
         raise AbstractMethodError(self)
 
     def _unbox_scalar(
-        self, value: DTScalarOrNaT, setitem: bool = False
+        self, value: DTScalarOrNaT
     ) -> np.int64 | np.datetime64 | np.timedelta64:
         """
         Unbox the integer value of a scalar `value`.
@@ -226,8 +226,6 @@ def _unbox_scalar(
         ----------
         value : Period, Timestamp, Timedelta, or NaT
             Depending on subclass.
-        setitem : bool, default False
-            Whether to check compatibility with setitem strictness.
 
         Returns
         -------
@@ -240,9 +238,7 @@ def _unbox_scalar(
         """
         raise AbstractMethodError(self)
 
-    def _check_compatible_with(
-        self, other: DTScalarOrNaT, setitem: bool = False
-    ) -> None:
+    def _check_compatible_with(self, other: DTScalarOrNaT) -> None:
         """
         Verify that `self` and `other` are compatible.
 
@@ -255,9 +251,6 @@ def _check_compatible_with(
         Parameters
         ----------
         other
-        setitem : bool, default False
-            For __setitem__ we may have stricter compatibility restrictions than
-            for comparisons.
 
         Raises
         ------
@@ -663,7 +656,7 @@ def _validate_scalar(
             #  this option exists to prevent a performance hit in
             #  TimedeltaIndex.get_loc
             return value
-        return self._unbox_scalar(value, setitem=setitem)
+        return self._unbox_scalar(value)
 
     def _validation_error_message(self, value, allow_listlike: bool = False) -> str:
         """
@@ -757,19 +750,18 @@ def _validate_setitem_value(self, value):
         else:
             return self._validate_scalar(value, allow_listlike=True)
 
-        return self._unbox(value, setitem=True)
+        return self._unbox(value)
 
-    def _unbox(
-        self, other, setitem: bool = False
-    ) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarray:
+    @final
+    def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarray:
         """
         Unbox either a scalar with _unbox_scalar or an instance of our own type.
         """
         if lib.is_scalar(other):
-            other = self._unbox_scalar(other, setitem=setitem)
+            other = self._unbox_scalar(other)
         else:
             # same type as self
-            self._check_compatible_with(other, setitem=setitem)
+            self._check_compatible_with(other)
             other = other._ndarray
         return other
 

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -480,36 +480,19 @@ def _generate_range(  # type: ignore[override]
     # -----------------------------------------------------------------
     # DatetimeLike Interface
 
-    def _unbox_scalar(self, value, setitem: bool = False) -> np.datetime64:
+    def _unbox_scalar(self, value) -> np.datetime64:
         if not isinstance(value, self._scalar_type) and value is not NaT:
             raise ValueError("'value' should be a Timestamp.")
-        self._check_compatible_with(value, setitem=setitem)
+        self._check_compatible_with(value)
         return value.asm8
 
     def _scalar_from_string(self, value) -> Timestamp | NaTType:
         return Timestamp(value, tz=self.tz)
 
-    def _check_compatible_with(self, other, setitem: bool = False):
+    def _check_compatible_with(self, other) -> None:
         if other is NaT:
             return
         self._assert_tzawareness_compat(other)
-        if setitem:
-            # Stricter check for setitem vs comparison methods
-            if self.tz is not None and not timezones.tz_compare(self.tz, other.tz):
-                # TODO(2.0): remove this check. GH#37605
-                warnings.warn(
-                    "Setitem-like behavior with mismatched timezones is deprecated "
-                    "and will change in a future version. Instead of raising "
-                    "(or for Index, Series, and DataFrame methods, coercing to "
-                    "object dtype), the value being set (or passed as a "
-                    "fill_value, or inserted) will be cast to the existing "
-                    "DatetimeArray/DatetimeIndex/Series/DataFrame column's "
-                    "timezone. To retain the old behavior, explicitly cast to "
-                    "object dtype before the operation.",
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
-                )
-                raise ValueError(f"Timezones don't match. '{self.tz}' != '{other.tz}'")
 
     # -----------------------------------------------------------------
     # Descriptive Properties
@@ -2029,6 +2012,11 @@ def _sequence_to_dt64ns(
         copy = False
         if lib.infer_dtype(data, skipna=False) == "integer":
             data = data.astype(np.int64)
+        elif tz is not None and ambiguous == "raise":
+            # TODO: yearfirst/dayfirst/etc?
+            obj_data = np.asarray(data, dtype=object)
+            i8data = tslib.array_to_datetime_with_tz(obj_data, tz)
+            return i8data.view(DT64NS_DTYPE), tz, None
         else:
             # data comes back here as either i8 to denote UTC timestamps
             #  or M8[ns] to denote wall times

diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -328,21 +328,20 @@ def _generate_range(cls, start, end, periods, freq, fields):
     def _unbox_scalar(  # type: ignore[override]
         self,
         value: Period | NaTType,
-        setitem: bool = False,
     ) -> np.int64:
         if value is NaT:
             # error: Item "Period" of "Union[Period, NaTType]" has no attribute "value"
             return np.int64(value.value)  # type: ignore[union-attr]
         elif isinstance(value, self._scalar_type):
-            self._check_compatible_with(value, setitem=setitem)
+            self._check_compatible_with(value)
             return np.int64(value.ordinal)
         else:
             raise ValueError(f"'value' should be a Period. Got '{value}' instead.")
 
     def _scalar_from_string(self, value: str) -> Period:
         return Period(value, freq=self.freq)
 
-    def _check_compatible_with(self, other, setitem: bool = False) -> None:
+    def _check_compatible_with(self, other) -> None:
         if other is NaT:
             return
         self._require_matching_freq(other)

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -288,10 +288,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
     # ----------------------------------------------------------------
     # DatetimeLike Interface
 
-    def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64:
+    def _unbox_scalar(self, value) -> np.timedelta64:
         if not isinstance(value, self._scalar_type) and value is not NaT:
             raise ValueError("'value' should be a Timedelta.")
-        self._check_compatible_with(value, setitem=setitem)
+        self._check_compatible_with(value)
         if value is NaT:
             return np.timedelta64(value.value, "ns")
         else:
@@ -301,7 +301,7 @@ def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64:
     def _scalar_from_string(self, value) -> Timedelta | NaTType:
         return Timedelta(value)
 
-    def _check_compatible_with(self, other, setitem: bool = False) -> None:
+    def _check_compatible_with(self, other) -> None:
         # we don't have anything to validate.
         pass
 

diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
@@ -292,19 +292,7 @@ def test_searchsorted(self):
         assert result == 10
 
     @pytest.mark.parametrize("box", [None, "index", "series"])
-    def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage):
-        if isinstance(arr1d, DatetimeArray):
-            tz = arr1d.tz
-            ts1, ts2 = arr1d[1:3]
-            if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2):
-                # If we have e.g. tzutc(), when we cast to string and parse
-                #  back we get pytz.UTC, and then consider them different timezones
-                #  so incorrectly raise.
-                mark = pytest.mark.xfail(
-                    raises=TypeError, reason="timezone comparisons inconsistent"
-                )
-                request.node.add_marker(mark)
-
+    def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
         arr = arr1d
         if box is None:
             pass
@@ -461,19 +449,8 @@ def test_setitem_object_dtype(self, box, arr1d):
 
         tm.assert_equal(arr1d, expected)
 
-    def test_setitem_strs(self, arr1d, request):
+    def test_setitem_strs(self, arr1d):
         # Check that we parse strs in both scalar and listlike
-        if isinstance(arr1d, DatetimeArray):
-            tz = arr1d.tz
-            ts1, ts2 = arr1d[-2:]
-            if tz is not None and ts1.tz.tzname(ts1) != ts2.tz.tzname(ts2):
-                # If we have e.g. tzutc(), when we cast to string and parse
-                #  back we get pytz.UTC, and then consider them different timezones
-                #  so incorrectly raise.
-                mark = pytest.mark.xfail(
-                    raises=TypeError, reason="timezone comparisons inconsistent"
-                )
-                request.node.add_marker(mark)
 
         # Setting list-like of strs
         expected = arr1d.copy()
@@ -852,18 +829,14 @@ def test_take_fill_valid(self, arr1d, fixed_now_ts):
             # GH#37356
             # Assuming here that arr1d fixture does not include Australia/Melbourne
             value = fixed_now_ts.tz_localize("Australia/Melbourne")
-            msg = "Timezones don't match. .* != 'Australia/Melbourne'"
-            with pytest.raises(ValueError, match=msg):
-                # require tz match, not just tzawareness match
-                with tm.assert_produces_warning(
-                    FutureWarning, match="mismatched timezone"
-                ):
-                    result = arr.take([-1, 1], allow_fill=True, fill_value=value)
-
-            # once deprecation is enforced
-            # expected = arr.take([-1, 1], allow_fill=True,
-            #  fill_value=value.tz_convert(arr.dtype.tz))
-            # tm.assert_equal(result, expected)
+            result = arr.take([-1, 1], allow_fill=True, fill_value=value)
+
+            expected = arr.take(
+                [-1, 1],
+                allow_fill=True,
+                fill_value=value.tz_convert(arr.dtype.tz),
+            )
+            tm.assert_equal(result, expected)
 
     def test_concat_same_type_invalid(self, arr1d):
         # different timezones

diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
@@ -429,19 +429,16 @@ def test_setitem_str_impute_tz(self, tz_naive_fixture):
         tm.assert_equal(arr, expected)
 
     def test_setitem_different_tz_raises(self):
+        # pre-2.0 we required exact tz match, in 2.0 we require only
+        #  tzawareness-match
         data = np.array([1, 2, 3], dtype="M8[ns]")
         arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central"))
         with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"):
             arr[0] = pd.Timestamp("2000")
 
         ts = pd.Timestamp("2000", tz="US/Eastern")
-        with pytest.raises(ValueError, match="US/Central"):
-            with tm.assert_produces_warning(
-                FutureWarning, match="mismatched timezones"
-            ):
-                arr[0] = ts
-        # once deprecation is enforced
-        # assert arr[0] == ts.tz_convert("US/Central")
+        arr[0] = ts
+        assert arr[0] == ts.tz_convert("US/Central")
 
     def test_setitem_clears_freq(self):
         a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central"))
@@ -688,23 +685,16 @@ def test_shift_value_tzawareness_mismatch(self):
                 dta.shift(1, fill_value=invalid)
 
     def test_shift_requires_tzmatch(self):
-        # since filling is setitem-like, we require a matching timezone,
-        #  not just matching tzawawreness
+        # pre-2.0 we required exact tz match, in 2.0 we require just
+        #  matching tzawareness
         dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
         dta = dti._data
 
         fill_value = pd.Timestamp("2020-10-18 18:44", tz="US/Pacific")
 
-        msg = "Timezones don't match. 'UTC' != 'US/Pacific'"
-        with pytest.raises(ValueError, match=msg):
-            with tm.assert_produces_warning(
-                FutureWarning, match="mismatched timezones"
-            ):
-                dta.shift(1, fill_value=fill_value)
-
-        # once deprecation is enforced
-        # expected = dta.shift(1, fill_value=fill_value.tz_convert("UTC"))
-        # tm.assert_equal(result, expected)
+        result = dta.shift(1, fill_value=fill_value)
+        expected = dta.shift(1, fill_value=fill_value.tz_convert("UTC"))
+        tm.assert_equal(result, expected)
 
     def test_tz_localize_t2d(self):
         dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")