Skip to content

BUG: Merge timezone aware data with DST #22825

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 1, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ Timezones
- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`)
- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`)
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`)
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this entry to the "Reshaping" section of the whatsnew file? It documents a ``merge`` fix, so it belongs there rather than under "Timezones".


Offsets
^^^^^^^
Expand Down
21 changes: 14 additions & 7 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op):
except TypeError:
return result

def _ensure_localized(self, result, ambiguous='raise'):
def _ensure_localized(self, result, ambiguous='raise', from_utc=False):
"""
ensure that we are re-localized

Expand All @@ -289,6 +289,7 @@ def _ensure_localized(self, result, ambiguous='raise'):
result : DatetimeIndex / i8 ndarray
ambiguous : str, bool, or bool-ndarray
default 'raise'
from_utc : bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please document the default value here as well (i.e. ``from_utc : bool, default False``).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you document what this parameter actually means (and do the same for ``to_utc`` below)?

Also, consider renaming ``result`` to ``array`` or ``input`` or similar — these look like holdover names and are confusing.


Returns
-------
Expand All @@ -299,7 +300,10 @@ def _ensure_localized(self, result, ambiguous='raise'):
if getattr(self, 'tz', None) is not None:
if not isinstance(result, ABCIndexClass):
result = self._simple_new(result)
result = result.tz_localize(self.tz, ambiguous=ambiguous)
if from_utc:
result = result.tz_localize('UTC').tz_convert(self.tz)
else:
result = result.tz_localize(self.tz, ambiguous=ambiguous)
return result

def _box_values_as_index(self):
Expand Down Expand Up @@ -622,11 +626,11 @@ def repeat(self, repeats, *args, **kwargs):

@Appender(_index_shared_docs['where'] % _index_doc_kwargs)
def where(self, cond, other=None):
other = _ensure_datetimelike_to_i8(other)
values = _ensure_datetimelike_to_i8(self)
other = _ensure_datetimelike_to_i8(other, to_utc=True)
values = _ensure_datetimelike_to_i8(self, to_utc=True)
result = np.where(cond, values, other).astype('i8')

result = self._ensure_localized(result)
result = self._ensure_localized(result, from_utc=True)
return self._shallow_copy(result,
**self._get_attributes_dict())

Expand Down Expand Up @@ -695,14 +699,17 @@ def astype(self, dtype, copy=True):
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)


def _ensure_datetimelike_to_i8(other):
def _ensure_datetimelike_to_i8(other, to_utc=False):
""" helper for coercing an input scalar or array to i8 """
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you expand this into a full docstring here (parameters, return value)?

if is_scalar(other) and isna(other):
other = iNaT
elif isinstance(other, ABCIndexClass):
# convert tz if needed
if getattr(other, 'tz', None) is not None:
other = other.tz_localize(None).asi8
if to_utc:
other = other.tz_convert('UTC').asi8
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move the ``.asi8`` call below the ``if``/``else`` (applying it once to ``other``) to avoid repeating it in both branches?

else:
other = other.tz_localize(None).asi8
else:
other = other.asi8
else:
Expand Down
39 changes: 28 additions & 11 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,11 +590,9 @@ def test_where_series_datetime64(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'), values[3]])
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize("fill_val,exp_dtype", [
(pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
(pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)],
ids=['datetime64', 'datetime64tz'])
def test_where_index_datetime(self, fill_val, exp_dtype):
def test_where_index_datetime(self):
fill_val = pd.Timestamp('2012-01-01')
exp_dtype = 'datetime64[ns]'
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
Expand All @@ -613,13 +611,32 @@ def test_where_index_datetime(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04')])

if fill_val.tz:
self._assert_where_conversion(obj, cond, values, exp,
'datetime64[ns]')
pytest.xfail("ToDo: do not ignore timezone, must be object")
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
pytest.xfail("datetime64 + datetime64 -> datetime64 must support"
" scalar")

@pytest.mark.xfail(reason="ToDo: do not ignore timezone, must be object")
def test_where_index_datetimetz(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was bizarrely structured before: it was given arguments that made everything pass before it ever hit the ``xfail``, so the failing case was never actually exercised.

The good news is that one of the cases does pass, so I am splitting the tz-aware and tz-naive cases into separate tests and properly marking the tz-aware case as ``xfail`` (it was already failing on master and is orthogonal to this change).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there an issue for this case? if not can you create one and reference it here

fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern')
exp_dtype = np.object
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2011-01-04')])
assert obj.dtype == 'datetime64[ns]'
cond = pd.Index([True, False, True, False])

msg = ("Index\\(\\.\\.\\.\\) must be called with a collection "
"of some kind")
with tm.assert_raises_regex(TypeError, msg):
obj.where(cond, fill_val)

values = pd.Index(pd.date_range(fill_val, periods=4))
exp = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2012-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04', tz='US/Eastern')],
dtype=exp_dtype)

self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

def test_where_index_complex128(self):
pass
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,30 @@ def test_merge_on_datetime64tz(self):
assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'

def test_merge_datetime64tz_with_dst_transition(self):
# GH 18885
df1 = pd.DataFrame(pd.date_range(
'2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
columns=['date'])
df1['value'] = 1
df2 = pd.DataFrame([
pd.to_datetime('2017-10-29 03:00:00'),
pd.to_datetime('2017-10-29 04:00:00'),
pd.to_datetime('2017-10-29 05:00:00')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you construct this frame with ``date_range`` as well, to shorten it a bit? (If not, it is more idiomatic to pass a single list of strings to ``to_datetime`` rather than calling it on each string individually.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can maybe also directly include the 'value' column in the constructor

],
columns=['date'])
df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
'Europe/Madrid')
df2['value'] = 2
result = pd.merge(df1, df2, how='outer', on='date')
expected = pd.DataFrame({
'date': pd.date_range(
'2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
'value_x': [1] * 4 + [np.nan] * 3,
'value_y': [np.nan] * 4 + [2] * 3
})
assert_frame_equal(result, expected)

def test_merge_non_unique_period_index(self):
# GH #16871
index = pd.period_range('2016-01-01', periods=16, freq='M')
Expand Down