From ce784e005f241868d1181afce9e172c1f391d595 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 24 Sep 2018 16:40:58 -0700 Subject: [PATCH 1/5] BUG: Merge timezone aware data with DST --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/indexes/datetimelike.py | 21 ++++++++++++++------- pandas/tests/reshape/merge/test_merge.py | 24 ++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6c91b6374b8af..346784c924b6d 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -681,6 +681,7 @@ Timezones - Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) - Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) +- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) Offsets ^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 578167a7db500..7093fcb51ebcb 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op): except TypeError: return result - def _ensure_localized(self, result, ambiguous='raise'): + def _ensure_localized(self, result, ambiguous='raise', from_utc=False): """ ensure that we are re-localized @@ -289,6 +289,7 @@ def _ensure_localized(self, result, ambiguous='raise'): result : DatetimeIndex / i8 ndarray ambiguous : str, bool, or bool-ndarray default 'raise' + from_utc : bool Returns ------- @@ -299,7 +300,10 @@ def _ensure_localized(self, result, ambiguous='raise'): if getattr(self, 'tz', None) is not None: if not isinstance(result, 
ABCIndexClass): result = self._simple_new(result) - result = result.tz_localize(self.tz, ambiguous=ambiguous) + if from_utc: + result = result.tz_localize('UTC').tz_convert(self.tz) + else: + result = result.tz_localize(self.tz, ambiguous=ambiguous) return result def _box_values_as_index(self): @@ -622,11 +626,11 @@ def repeat(self, repeats, *args, **kwargs): @Appender(_index_shared_docs['where'] % _index_doc_kwargs) def where(self, cond, other=None): - other = _ensure_datetimelike_to_i8(other) - values = _ensure_datetimelike_to_i8(self) + other = _ensure_datetimelike_to_i8(other, to_utc=True) + values = _ensure_datetimelike_to_i8(self, to_utc=True) result = np.where(cond, values, other).astype('i8') - result = self._ensure_localized(result) + result = self._ensure_localized(result, from_utc=True) return self._shallow_copy(result, **self._get_attributes_dict()) @@ -695,14 +699,17 @@ def astype(self, dtype, copy=True): return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy) -def _ensure_datetimelike_to_i8(other): +def _ensure_datetimelike_to_i8(other, to_utc=False): """ helper for coercing an input scalar or array to i8 """ if is_scalar(other) and isna(other): other = iNaT elif isinstance(other, ABCIndexClass): # convert tz if needed if getattr(other, 'tz', None) is not None: - other = other.tz_localize(None).asi8 + if to_utc: + other = other.tz_convert('UTC').asi8 + else: + other = other.tz_localize(None).asi8 else: other = other.asi8 else: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 42df4511578f1..2ec5c2b78a04f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -601,6 +601,30 @@ def test_merge_on_datetime64tz(self): assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]' assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]' + def test_merge_datetime64tz_with_dst_transition(self): + # GH 18885 + df1 = pd.DataFrame(pd.date_range( 
+ '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'), + columns=['date']) + df1['value'] = 1 + df2 = pd.DataFrame([ + pd.to_datetime('2017-10-29 03:00:00'), + pd.to_datetime('2017-10-29 04:00:00'), + pd.to_datetime('2017-10-29 05:00:00') + ], + columns=['date']) + df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert( + 'Europe/Madrid') + df2['value'] = 2 + result = pd.merge(df1, df2, how='outer', on='date') + expected = pd.DataFrame({ + 'date': pd.date_range( + '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'), + 'value_x': [1] * 4 + [np.nan] * 3, + 'value_y': [np.nan] * 4 + [2] * 3 + }) + assert_frame_equal(result, expected) + def test_merge_non_unique_period_index(self): # GH #16871 index = pd.period_range('2016-01-01', periods=16, freq='M') From 52602b998e21ce34e4faeafe2263412e5bc41d30 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 24 Sep 2018 20:49:06 -0700 Subject: [PATCH 2/5] split unrelated test that was failing --- pandas/tests/indexing/test_coercion.py | 39 ++++++++++++++++++-------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index e7daefffe5f6f..51a0d07975957 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -590,11 +590,9 @@ def test_where_series_datetime64(self, fill_val, exp_dtype): pd.Timestamp('2011-01-03'), values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)], - ids=['datetime64', 'datetime64tz']) - def test_where_index_datetime(self, fill_val, exp_dtype): + def test_where_index_datetime(self): + fill_val = pd.Timestamp('2012-01-01') + exp_dtype = 'datetime64[ns]' obj = pd.Index([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-03'), @@ -613,13 +611,32 
@@ def test_where_index_datetime(self, fill_val, exp_dtype): pd.Timestamp('2011-01-03'), pd.Timestamp('2012-01-04')]) - if fill_val.tz: - self._assert_where_conversion(obj, cond, values, exp, - 'datetime64[ns]') - pytest.xfail("ToDo: do not ignore timezone, must be object") self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - pytest.xfail("datetime64 + datetime64 -> datetime64 must support" - " scalar") + + @pytest.mark.xfail(reason="ToDo: do not ignore timezone, must be object") + def test_where_index_datetimetz(self): + fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern') + exp_dtype = np.object + obj = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + assert obj.dtype == 'datetime64[ns]' + cond = pd.Index([True, False, True, False]) + + msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " + "of some kind") + with tm.assert_raises_regex(TypeError, msg): + obj.where(cond, fill_val) + + values = pd.Index(pd.date_range(fill_val, periods=4)) + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04', tz='US/Eastern')], + dtype=exp_dtype) + + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) def test_where_index_complex128(self): pass From cff2bca3be9f2ce147f915822996bf44b8a471e8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 25 Sep 2018 16:30:00 -0700 Subject: [PATCH 3/5] Address review and move asi8 to the last call --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/indexes/datetimelike.py | 47 +++++++++++++++++++---------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 346784c924b6d..129a7200548e4 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -681,7 +681,6 @@ Timezones - Bug in :meth:`DatetimeIndex.unique` that did 
not re-localize tz-aware dates correctly (:issue:`21737`) - Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) -- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) Offsets ^^^^^^^ @@ -805,6 +804,7 @@ Reshaping - Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) - Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) - Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) +- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7093fcb51ebcb..db44e1bb47281 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op): except TypeError: return result - def _ensure_localized(self, result, ambiguous='raise', from_utc=False): + def _ensure_localized(self, arg, ambiguous='raise', from_utc=False): """ ensure that we are re-localized @@ -286,10 +286,13 @@ def _ensure_localized(self, result, ambiguous='raise', from_utc=False): Parameters ---------- - result : DatetimeIndex / i8 ndarray + arg : DatetimeIndex / i8 ndarray ambiguous : str, bool, or bool-ndarray default 'raise' from_utc : bool + default False + If True, localize the i8 ndarray to UTC first 
before converting to + the appropriate tz. If False, localize directly to the tz. Returns ------- @@ -298,13 +301,13 @@ def _ensure_localized(self, result, ambiguous='raise', from_utc=False): # reconvert to local tz if getattr(self, 'tz', None) is not None: - if not isinstance(result, ABCIndexClass): - result = self._simple_new(result) + if not isinstance(arg, ABCIndexClass): + arg = self._simple_new(arg) if from_utc: - result = result.tz_localize('UTC').tz_convert(self.tz) + arg = arg.tz_localize('UTC').tz_convert(self.tz) else: - result = result.tz_localize(self.tz, ambiguous=ambiguous) - return result + arg = arg.tz_localize(self.tz, ambiguous=ambiguous) + return arg def _box_values_as_index(self): """ @@ -700,25 +703,37 @@ def astype(self, dtype, copy=True): def _ensure_datetimelike_to_i8(other, to_utc=False): - """ helper for coercing an input scalar or array to i8 """ + """ + helper for coercing an input scalar or array to i8 + + Parameters + ---------- + other : 1d array + to_utc : bool + default False + If True, convert the values to UTC before extracting the i8 values. + If False, extract the i8 values directly.
+ + Returns + ------- + i8 1d array + """ if is_scalar(other) and isna(other): - other = iNaT + return iNaT elif isinstance(other, ABCIndexClass): # convert tz if needed if getattr(other, 'tz', None) is not None: if to_utc: - other = other.tz_convert('UTC').asi8 + other = other.tz_convert('UTC') else: - other = other.tz_localize(None).asi8 - else: - other = other.asi8 + other = other.tz_localize(None) else: try: - other = np.array(other, copy=False).view('i8') + return np.array(other, copy=False).view('i8') except TypeError: # period array cannot be coerces to int - other = Index(other).asi8 - return other + other = Index(other) + return other.asi8 def wrap_arithmetic_op(self, other, result): From cc5e9cff34a3f89c4a9a49423a05f2d31b974737 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 26 Sep 2018 09:37:08 -0700 Subject: [PATCH 4/5] Fix docstrings --- pandas/core/indexes/datetimelike.py | 9 +++------ pandas/tests/indexing/test_coercion.py | 3 ++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index db44e1bb47281..126908d4254fc 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -287,10 +287,8 @@ def _ensure_localized(self, arg, ambiguous='raise', from_utc=False): Parameters ---------- arg : DatetimeIndex / i8 ndarray - ambiguous : str, bool, or bool-ndarray - default 'raise' - from_utc : bool - default False + ambiguous : str, bool, or bool-ndarray, default 'raise' + from_utc : bool, default False If True, localize the i8 ndarray to UTC first before converting to the appropriate tz. If False, localize directly to the tz. @@ -709,8 +707,7 @@ def _ensure_datetimelike_to_i8(other, to_utc=False): Parameters ---------- other : 1d array - to_utc : bool - default False + to_utc : bool, default False If True, convert the values to UTC before extracting the i8 values. If False, extract the i8 values directly.
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 51a0d07975957..2f44cb36eeb11 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -613,7 +613,8 @@ def test_where_index_datetime(self): self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.xfail(reason="ToDo: do not ignore timezone, must be object") + @pytest.mark.xfail( + reason="GH 22839: do not ignore timezone, must be object") def test_where_index_datetimetz(self): fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern') exp_dtype = np.object From 9eca40ee8d68ae0b4e9e0af514e4fe4fe3cce8f4 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 27 Sep 2018 11:03:02 -0700 Subject: [PATCH 5/5] simplify test construction --- pandas/tests/reshape/merge/test_merge.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 2ec5c2b78a04f..50ef622a4147f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -607,15 +607,15 @@ def test_merge_datetime64tz_with_dst_transition(self): '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'), columns=['date']) df1['value'] = 1 - df2 = pd.DataFrame([ - pd.to_datetime('2017-10-29 03:00:00'), - pd.to_datetime('2017-10-29 04:00:00'), - pd.to_datetime('2017-10-29 05:00:00') - ], - columns=['date']) + df2 = pd.DataFrame({ + 'date': pd.to_datetime([ + '2017-10-29 03:00:00', '2017-10-29 04:00:00', + '2017-10-29 05:00:00' + ]), + 'value': 2 + }) df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert( 'Europe/Madrid') - df2['value'] = 2 result = pd.merge(df1, df2, how='outer', on='date') expected = pd.DataFrame({ 'date': pd.date_range(