Skip to content

BUG: Merge timezone aware data with DST #22825

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 1, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ Timezones
- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`)
- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`)
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`)
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this entry to the "Reshaping" section of the whatsnew file? It documents a ``merge`` fix, so it belongs there rather than under "Timezones".


Offsets
^^^^^^^
Expand Down
21 changes: 14 additions & 7 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op):
except TypeError:
return result

def _ensure_localized(self, result, ambiguous='raise'):
def _ensure_localized(self, result, ambiguous='raise', from_utc=False):
"""
ensure that we are re-localized

Expand All @@ -289,6 +289,7 @@ def _ensure_localized(self, result, ambiguous='raise'):
result : DatetimeIndex / i8 ndarray
ambiguous : str, bool, or bool-ndarray
default 'raise'
from_utc : bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please document the default value here as well (i.e. ``from_utc : bool, default False``).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you document what this parameter actually means (and do the same for ``to_utc`` below)?

Also, consider renaming ``result`` to ``array`` or ``input`` or similar — these look like holdover names and are confusing.


Returns
-------
Expand All @@ -299,7 +300,10 @@ def _ensure_localized(self, result, ambiguous='raise'):
if getattr(self, 'tz', None) is not None:
if not isinstance(result, ABCIndexClass):
result = self._simple_new(result)
result = result.tz_localize(self.tz, ambiguous=ambiguous)
if from_utc:
result = result.tz_localize('UTC').tz_convert(self.tz)
else:
result = result.tz_localize(self.tz, ambiguous=ambiguous)
return result

def _box_values_as_index(self):
Expand Down Expand Up @@ -622,11 +626,11 @@ def repeat(self, repeats, *args, **kwargs):

@Appender(_index_shared_docs['where'] % _index_doc_kwargs)
def where(self, cond, other=None):
other = _ensure_datetimelike_to_i8(other)
values = _ensure_datetimelike_to_i8(self)
other = _ensure_datetimelike_to_i8(other, to_utc=True)
values = _ensure_datetimelike_to_i8(self, to_utc=True)
result = np.where(cond, values, other).astype('i8')

result = self._ensure_localized(result)
result = self._ensure_localized(result, from_utc=True)
return self._shallow_copy(result,
**self._get_attributes_dict())

Expand Down Expand Up @@ -695,14 +699,17 @@ def astype(self, dtype, copy=True):
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)


def _ensure_datetimelike_to_i8(other):
def _ensure_datetimelike_to_i8(other, to_utc=False):
""" helper for coercing an input scalar or array to i8 """
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you expand this into a full docstring here (parameters, return value)?

if is_scalar(other) and isna(other):
other = iNaT
elif isinstance(other, ABCIndexClass):
# convert tz if needed
if getattr(other, 'tz', None) is not None:
other = other.tz_localize(None).asi8
if to_utc:
other = other.tz_convert('UTC').asi8
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move the ``.asi8`` call below the ``if``/``else`` (applying it once to ``other``) to avoid repeating it in both branches?

else:
other = other.tz_localize(None).asi8
else:
other = other.asi8
else:
Expand Down
39 changes: 28 additions & 11 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,11 +590,9 @@ def test_where_series_datetime64(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'), values[3]])
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize("fill_val,exp_dtype", [
(pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
(pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)],
ids=['datetime64', 'datetime64tz'])
def test_where_index_datetime(self, fill_val, exp_dtype):
def test_where_index_datetime(self):
fill_val = pd.Timestamp('2012-01-01')
exp_dtype = 'datetime64[ns]'
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
Expand All @@ -613,13 +611,32 @@ def test_where_index_datetime(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04')])

if fill_val.tz:
self._assert_where_conversion(obj, cond, values, exp,
'datetime64[ns]')
pytest.xfail("ToDo: do not ignore timezone, must be object")
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
pytest.xfail("datetime64 + datetime64 -> datetime64 must support"
" scalar")

@pytest.mark.xfail(reason="ToDo: do not ignore timezone, must be object")
def test_where_index_datetimetz(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was bizarrely structured before: it was given arguments that made everything pass before it ever hit the ``xfail``, so the failing case was never actually exercised.

The good news is that one of the cases does pass, so I am splitting the tz-aware and tz-naive cases into separate tests and properly marking the tz-aware case as ``xfail`` (it was already failing on master and is orthogonal to this change).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there an issue for this case? if not can you create one and reference it here

fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern')
exp_dtype = np.object
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2011-01-04')])
assert obj.dtype == 'datetime64[ns]'
cond = pd.Index([True, False, True, False])

msg = ("Index\\(\\.\\.\\.\\) must be called with a collection "
"of some kind")
with tm.assert_raises_regex(TypeError, msg):
obj.where(cond, fill_val)

values = pd.Index(pd.date_range(fill_val, periods=4))
exp = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2012-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04', tz='US/Eastern')],
dtype=exp_dtype)

self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

def test_where_index_complex128(self):
pass
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,30 @@ def test_merge_on_datetime64tz(self):
assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'

def test_merge_datetime64tz_with_dst_transition(self):
# GH 18885
df1 = pd.DataFrame(pd.date_range(
'2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
columns=['date'])
df1['value'] = 1
df2 = pd.DataFrame([
pd.to_datetime('2017-10-29 03:00:00'),
pd.to_datetime('2017-10-29 04:00:00'),
pd.to_datetime('2017-10-29 05:00:00')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you construct this frame with ``date_range`` as well, to shorten it a bit? (If not, it is more idiomatic to pass a single list of strings to ``to_datetime`` rather than calling it on each string individually.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can maybe also directly include the 'value' column in the constructor

],
columns=['date'])
df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
'Europe/Madrid')
df2['value'] = 2
result = pd.merge(df1, df2, how='outer', on='date')
expected = pd.DataFrame({
'date': pd.date_range(
'2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
'value_x': [1] * 4 + [np.nan] * 3,
'value_y': [np.nan] * 4 + [2] * 3
})
assert_frame_equal(result, expected)

def test_merge_non_unique_period_index(self):
# GH #16871
index = pd.period_range('2016-01-01', periods=16, freq='M')
Expand Down