Skip to content

Commit a277e4a

Browse files
mroeschke and jreback
authored and committed
BUG: Merge timezone aware data with DST (#22825)
1 parent f021bbc commit a277e4a

File tree

4 files changed

+93
-31
lines changed

4 files changed

+93
-31
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,7 @@ Reshaping
815815
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`)
816816
- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`)
817817
- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`)
818+
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)
818819

819820
Build Changes
820821
^^^^^^^^^^^^^

pandas/core/indexes/datetimelike.py

+39-20
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op):
277277
except TypeError:
278278
return result
279279

280-
def _ensure_localized(self, result, ambiguous='raise'):
280+
def _ensure_localized(self, arg, ambiguous='raise', from_utc=False):
281281
"""
282282
ensure that we are re-localized
283283
@@ -286,9 +286,11 @@ def _ensure_localized(self, result, ambiguous='raise'):
286286
287287
Parameters
288288
----------
289-
result : DatetimeIndex / i8 ndarray
290-
ambiguous : str, bool, or bool-ndarray
291-
default 'raise'
289+
arg : DatetimeIndex / i8 ndarray
290+
ambiguous : str, bool, or bool-ndarray, default 'raise'
291+
from_utc : bool, default False
292+
If True, localize the i8 ndarray to UTC first before converting to
293+
the appropriate tz. If False, localize directly to the tz.
292294
293295
Returns
294296
-------
@@ -297,10 +299,13 @@ def _ensure_localized(self, result, ambiguous='raise'):
297299

298300
# reconvert to local tz
299301
if getattr(self, 'tz', None) is not None:
300-
if not isinstance(result, ABCIndexClass):
301-
result = self._simple_new(result)
302-
result = result.tz_localize(self.tz, ambiguous=ambiguous)
303-
return result
302+
if not isinstance(arg, ABCIndexClass):
303+
arg = self._simple_new(arg)
304+
if from_utc:
305+
arg = arg.tz_localize('UTC').tz_convert(self.tz)
306+
else:
307+
arg = arg.tz_localize(self.tz, ambiguous=ambiguous)
308+
return arg
304309

305310
def _box_values_as_index(self):
306311
"""
@@ -622,11 +627,11 @@ def repeat(self, repeats, *args, **kwargs):
622627

623628
@Appender(_index_shared_docs['where'] % _index_doc_kwargs)
624629
def where(self, cond, other=None):
625-
other = _ensure_datetimelike_to_i8(other)
626-
values = _ensure_datetimelike_to_i8(self)
630+
other = _ensure_datetimelike_to_i8(other, to_utc=True)
631+
values = _ensure_datetimelike_to_i8(self, to_utc=True)
627632
result = np.where(cond, values, other).astype('i8')
628633

629-
result = self._ensure_localized(result)
634+
result = self._ensure_localized(result, from_utc=True)
630635
return self._shallow_copy(result,
631636
**self._get_attributes_dict())
632637

@@ -695,23 +700,37 @@ def astype(self, dtype, copy=True):
695700
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
696701

697702

698-
def _ensure_datetimelike_to_i8(other):
699-
""" helper for coercing an input scalar or array to i8 """
703+
def _ensure_datetimelike_to_i8(other, to_utc=False):
704+
"""
705+
helper for coercing an input scalar or array to i8
706+
707+
Parameters
708+
----------
709+
other : 1d array
710+
to_utc : bool, default False
711+
If True, convert the values to UTC before extracting the i8 values
712+
If False, extract the i8 values directly.
713+
714+
Returns
715+
-------
716+
i8 1d array
717+
"""
700718
if is_scalar(other) and isna(other):
701-
other = iNaT
719+
return iNaT
702720
elif isinstance(other, ABCIndexClass):
703721
# convert tz if needed
704722
if getattr(other, 'tz', None) is not None:
705-
other = other.tz_localize(None).asi8
706-
else:
707-
other = other.asi8
723+
if to_utc:
724+
other = other.tz_convert('UTC')
725+
else:
726+
other = other.tz_localize(None)
708727
else:
709728
try:
710-
other = np.array(other, copy=False).view('i8')
729+
return np.array(other, copy=False).view('i8')
711730
except TypeError:
712731
# period array cannot be coerced to int
713-
other = Index(other).asi8
714-
return other
732+
other = Index(other)
733+
return other.asi8
715734

716735

717736
def wrap_arithmetic_op(self, other, result):

pandas/tests/indexing/test_coercion.py

+29-11
Original file line numberDiff line numberDiff line change
@@ -590,11 +590,9 @@ def test_where_series_datetime64(self, fill_val, exp_dtype):
590590
pd.Timestamp('2011-01-03'), values[3]])
591591
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
592592

593-
@pytest.mark.parametrize("fill_val,exp_dtype", [
594-
(pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
595-
(pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)],
596-
ids=['datetime64', 'datetime64tz'])
597-
def test_where_index_datetime(self, fill_val, exp_dtype):
593+
def test_where_index_datetime(self):
594+
fill_val = pd.Timestamp('2012-01-01')
595+
exp_dtype = 'datetime64[ns]'
598596
obj = pd.Index([pd.Timestamp('2011-01-01'),
599597
pd.Timestamp('2011-01-02'),
600598
pd.Timestamp('2011-01-03'),
@@ -613,13 +611,33 @@ def test_where_index_datetime(self, fill_val, exp_dtype):
613611
pd.Timestamp('2011-01-03'),
614612
pd.Timestamp('2012-01-04')])
615613

616-
if fill_val.tz:
617-
self._assert_where_conversion(obj, cond, values, exp,
618-
'datetime64[ns]')
619-
pytest.xfail("ToDo: do not ignore timezone, must be object")
620614
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
621-
pytest.xfail("datetime64 + datetime64 -> datetime64 must support"
622-
" scalar")
615+
616+
@pytest.mark.xfail(
617+
reason="GH 22839: do not ignore timezone, must be object")
618+
def test_where_index_datetimetz(self):
619+
fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern')
620+
exp_dtype = np.object
621+
obj = pd.Index([pd.Timestamp('2011-01-01'),
622+
pd.Timestamp('2011-01-02'),
623+
pd.Timestamp('2011-01-03'),
624+
pd.Timestamp('2011-01-04')])
625+
assert obj.dtype == 'datetime64[ns]'
626+
cond = pd.Index([True, False, True, False])
627+
628+
msg = ("Index\\(\\.\\.\\.\\) must be called with a collection "
629+
"of some kind")
630+
with tm.assert_raises_regex(TypeError, msg):
631+
obj.where(cond, fill_val)
632+
633+
values = pd.Index(pd.date_range(fill_val, periods=4))
634+
exp = pd.Index([pd.Timestamp('2011-01-01'),
635+
pd.Timestamp('2012-01-02', tz='US/Eastern'),
636+
pd.Timestamp('2011-01-03'),
637+
pd.Timestamp('2012-01-04', tz='US/Eastern')],
638+
dtype=exp_dtype)
639+
640+
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
623641

624642
def test_where_index_complex128(self):
625643
pass

pandas/tests/reshape/merge/test_merge.py

+24
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,30 @@ def test_merge_on_datetime64tz(self):
601601
assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
602602
assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'
603603

604+
def test_merge_datetime64tz_with_dst_transition(self):
605+
# GH 18885
606+
df1 = pd.DataFrame(pd.date_range(
607+
'2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
608+
columns=['date'])
609+
df1['value'] = 1
610+
df2 = pd.DataFrame({
611+
'date': pd.to_datetime([
612+
'2017-10-29 03:00:00', '2017-10-29 04:00:00',
613+
'2017-10-29 05:00:00'
614+
]),
615+
'value': 2
616+
})
617+
df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
618+
'Europe/Madrid')
619+
result = pd.merge(df1, df2, how='outer', on='date')
620+
expected = pd.DataFrame({
621+
'date': pd.date_range(
622+
'2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
623+
'value_x': [1] * 4 + [np.nan] * 3,
624+
'value_y': [np.nan] * 4 + [2] * 3
625+
})
626+
assert_frame_equal(result, expected)
627+
604628
def test_merge_non_unique_period_index(self):
605629
# GH #16871
606630
index = pd.period_range('2016-01-01', periods=16, freq='M')

0 commit comments

Comments
 (0)