From 85575a943c86073f2a6a4e6944ad0e91553cb3ba Mon Sep 17 00:00:00 2001 From: Mridul Seth Date: Wed, 17 Jan 2018 06:04:44 -0500 Subject: [PATCH 1/4] BUG: add _tz_compare() to handle comparisons of pytz tzs correctly closes #18523 --- pandas/core/indexes/datetimes.py | 26 ++++++++++++- .../indexes/datetimes/test_construction.py | 20 ++++++++++ pandas/tests/indexes/datetimes/test_tools.py | 9 +++++ pandas/tests/reshape/test_concat.py | 39 +++++++++++++++++++ 4 files changed, 92 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d83d2d2c93ec8..d48e35b5d4e61 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1181,6 +1181,28 @@ def join(self, other, how='left', level=None, return_indexers=False, return Index.join(this, other, how=how, level=level, return_indexers=return_indexers, sort=sort) + def _tz_compare(self, other): + """ + Compare string representations of timezones of two DatetimeIndex as + directly comparing equality is broken. The same timezone can be + represented as different instances of timezones. For example + `` and + `` are essentially same + timezones but aren't evaluted such, but the string representation + for both of these is `'Europe/Paris'`. 
+ + Parameters + ---------- + other: DatetimeIndex + + Returns: + ------- + compare : Boolean + + """ + # GH 18523 + return str(self.tzinfo) == str(other.tzinfo) + def _maybe_utc_convert(self, other): this = self if isinstance(other, DatetimeIndex): @@ -1192,7 +1214,7 @@ def _maybe_utc_convert(self, other): raise TypeError('Cannot join tz-naive with tz-aware ' 'DatetimeIndex') - if self.tz != other.tz: + if not self._tz_compare(other): this = self.tz_convert('UTC') other = other.tz_convert('UTC') return this, other @@ -1296,7 +1318,7 @@ def __iter__(self): def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None - if self.tz != other.tz: + if not self._tz_compare(other): raise ValueError('Passed item and index have different timezone') return self._simple_new(result, name=name, freq=None, tz=self.tz) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index b59dd25ead57f..588d5968bc932 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -443,6 +443,26 @@ def test_000constructor_resolution(self): assert idx.nanosecond[0] == t1.nanosecond + def test_concat(self): + idx1 = pd.date_range('2011-01-01', periods=3, freq='H', + tz='Europe/Paris') + idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H') + df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1) + df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2) + res = pd.concat([df1, df2], axis=1) + + assert str(res.index.tzinfo) == str(df1.index.tzinfo) + assert str(res.index.tzinfo) == str(df2.index.tzinfo) + + idx3 = pd.date_range('2011-01-01', periods=3, + freq='H', tz='Asia/Tokyo') + df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3) + res = pd.concat([df1, df3], axis=1) + + assert str(res.index.tzinfo) == 'UTC' + assert str(res.index.tzinfo) != str(df1.index.tzinfo) + assert str(res.index.tzinfo) != str(df3.index.tzinfo) + class 
TestTimeSeries(object): diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 44f3c21d23e62..935def1ffced1 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -918,6 +918,15 @@ def test_to_datetime_list_of_integers(self): tm.assert_index_equal(rng, result) +<<<<<<< HEAD +======= + def test_to_datetime_freq(self): + xp = bdate_range('2000-1-1', periods=10, tz='UTC') + rs = xp.to_datetime() + assert xp.freq == rs.freq + assert xp._tz_compare(rs) + +>>>>>>> PR_TOOL_MERGE_PR_18596 def test_to_datetime_overflow(self): # gh-17637 # we are overflowing Timedelta range here diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 150410e404305..a600e4cfeb3bc 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2074,6 +2074,45 @@ def test_concat_order(self): expected = expected.sort_values() tm.assert_index_equal(result, expected) + def test_concat_datetime_timezone(self): + # GH 18523 + idx1 = pd.date_range('2011-01-01', periods=3, freq='H', + tz='Europe/Paris') + idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H') + df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1) + df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2) + res = pd.concat([df1, df2], axis=1) + + exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00', + '2011-01-01 01:00:00+01:00', + '2011-01-01 02:00:00+01:00'], + freq='H' + ).tz_localize('UTC').tz_convert('Europe/Paris') + + exp = pd.DataFrame([[1, 1], [2, 2], [3, 3]], + index=exp_idx, columns=['a', 'b']) + + tm.assert_frame_equal(res, exp) + + idx3 = pd.date_range('2011-01-01', periods=3, + freq='H', tz='Asia/Tokyo') + df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3) + res = pd.concat([df1, df3], axis=1) + + exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00', + '2010-12-31 16:00:00+00:00', + '2010-12-31 17:00:00+00:00', + '2010-12-31 23:00:00+00:00', + 
'2011-01-01 00:00:00+00:00', + '2011-01-01 01:00:00+00:00'] + ).tz_localize('UTC') + + exp = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3], + [1, np.nan], [2, np.nan], [3, np.nan]], + index=exp_idx, columns=['a', 'b']) + + tm.assert_frame_equal(res, exp) + @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) @pytest.mark.parametrize('dt', np.sctypes['float']) From fca9e34caa6096a9cad105d34d66c7ff84f5df09 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 17 Jan 2018 06:06:31 -0500 Subject: [PATCH 2/4] fixup --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/_libs/interval.pyx | 4 +-- pandas/_libs/src/inference.pyx | 4 +-- pandas/_libs/tslibs/conversion.pyx | 4 +-- pandas/_libs/tslibs/timestamps.pyx | 5 ++-- pandas/_libs/tslibs/timezones.pxd | 1 + pandas/_libs/tslibs/timezones.pyx | 27 +++++++++++++++++- pandas/core/indexes/datetimes.py | 29 ++------------------ pandas/tests/indexes/datetimes/test_tools.py | 9 ------ pandas/tests/reshape/test_concat.py | 18 ++++++------ 10 files changed, 49 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 853d5cee11cd1..c4b56a52ea589 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -427,7 +427,6 @@ Conversion - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) -- - - Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`) - Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`) @@ -503,6 +502,7 @@ Reshaping - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. 
The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) +- Bug in timezone comparisons, manifesting as a conversion of the index to UTC when ``.reset_index()`` (:issue:`18523`) - Numeric diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index f1da60057186c..e1ffd450c9a68 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -6,7 +6,7 @@ cimport cython import cython from numpy cimport ndarray from tslib import Timestamp -from tslibs.timezones cimport get_timezone +from tslibs.timezones cimport tz_compare from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, PyObject_RichCompare) @@ -131,7 +131,7 @@ cdef class Interval(IntervalMixin): if not left <= right: raise ValueError('left side of interval must be <= right side') if (isinstance(left, Timestamp) and - get_timezone(left.tzinfo) != get_timezone(right.tzinfo)): + not tz_compare(left.tzinfo, right.tzinfo)): # GH 18538 msg = ("left and right must have the same time zone, got " "'{left_tz}' and '{right_tz}'") diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index b74b3a79fd69a..e15f276b39bf8 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -5,7 +5,7 @@ cimport cython from tslibs.nattype import NaT from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 -from tslibs.timezones cimport get_timezone +from tslibs.timezones cimport get_timezone, tz_compare from datetime import datetime, timedelta iNaT = util.get_nat() @@ -907,7 +907,7 @@ cpdef bint is_datetime_with_singletz_array(ndarray values): val = values[j] if val is not NaT: tz = getattr(val, 'tzinfo', None) - if 
base_tz != tz and base_tz != get_timezone(tz): + if not tz_compare(base_tz, tz): return False break diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 53abdd013ec37..9cfe41172fedc 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -35,7 +35,7 @@ from timedeltas cimport cast_from_unit from timezones cimport (is_utc, is_tzlocal, is_fixed_offset, treat_tz_as_dateutil, treat_tz_as_pytz, get_utcoffset, get_dst_info, - get_timezone, maybe_get_tz) + get_timezone, maybe_get_tz, tz_compare) from parsing import parse_datetime_string from nattype import nat_strings, NaT @@ -169,7 +169,7 @@ def datetime_to_datetime64(ndarray[object] values): elif PyDateTime_Check(val): if val.tzinfo is not None: if inferred_tz is not None: - if get_timezone(val.tzinfo) != inferred_tz: + if not tz_compare(val.tzinfo, inferred_tz): raise ValueError('Array must be all same time zone') else: inferred_tz = get_timezone(val.tzinfo) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index de31643742d87..1ddb299598fd0 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -33,7 +33,8 @@ from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds, is_leapyear) from timedeltas import Timedelta from timedeltas cimport delta_to_nanoseconds -from timezones cimport get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz +from timezones cimport ( + get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz, tz_compare) # ---------------------------------------------------------------------- # Constants @@ -266,7 +267,7 @@ cdef class _Timestamp(datetime): other = Timestamp(other) # validate tz's - if get_timezone(self.tzinfo) != get_timezone(other.tzinfo): + if not tz_compare(self.tzinfo, other.tzinfo): raise TypeError("Timestamp subtraction must have the " "same timezones or no timezones") diff --git a/pandas/_libs/tslibs/timezones.pxd 
b/pandas/_libs/tslibs/timezones.pxd index 95e0474b3a174..67353f3eec614 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -7,6 +7,7 @@ cdef bint is_tzlocal(object tz) cdef bint treat_tz_as_pytz(object tz) cdef bint treat_tz_as_dateutil(object tz) +cpdef bint tz_compare(object start, object end) cpdef object get_timezone(object tz) cpdef object maybe_get_tz(object tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index fdcf40337fab9..eab451d611dc9 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -275,7 +275,7 @@ cdef object get_dst_info(object tz): def infer_tzinfo(start, end): if start is not None and end is not None: tz = start.tzinfo - if not (get_timezone(tz) == get_timezone(end.tzinfo)): + if not tz_compare(tz, end.tzinfo): msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) elif start is not None: @@ -285,3 +285,28 @@ def infer_tzinfo(start, end): else: tz = None return tz + + +cpdef bint tz_compare(object start, object end): + """ + Compare string representations of timezones + + The same timezone can be represented as different instances of + timezones. For example + `<DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>` and + `<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>` are essentially same + timezones but aren't evaluted such, but the string representation + for both of these is `'Europe/Paris'`.
+ + Parameters + ---------- + start : tzinfo + end : tzinfo + + Returns + ------- + compare : bint + + """ + # GH 18523 + return get_timezone(start) == get_timezone(end) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d48e35b5d4e61..4ec929947783c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -521,8 +521,7 @@ def _generate(cls, start, end, periods, name, offset, tz = tz.localize(date.replace(tzinfo=None)).tzinfo if tz is not None and inferred_tz is not None: - if not (timezones.get_timezone(inferred_tz) == - timezones.get_timezone(tz)): + if not timezones.tz_compare(inferred_tz, tz): raise AssertionError("Inferred time zone not equal to passed " "time zone") @@ -1181,28 +1180,6 @@ def join(self, other, how='left', level=None, return_indexers=False, return Index.join(this, other, how=how, level=level, return_indexers=return_indexers, sort=sort) - def _tz_compare(self, other): - """ - Compare string representations of timezones of two DatetimeIndex as - directly comparing equality is broken. The same timezone can be - represented as different instances of timezones. For example - `` and - `` are essentially same - timezones but aren't evaluted such, but the string representation - for both of these is `'Europe/Paris'`.
- - Parameters - ---------- - other: DatetimeIndex - - Returns: - ------- - compare : Boolean - - """ - # GH 18523 - return str(self.tzinfo) == str(other.tzinfo) - def _maybe_utc_convert(self, other): this = self if isinstance(other, DatetimeIndex): @@ -1214,7 +1191,7 @@ def _maybe_utc_convert(self, other): raise TypeError('Cannot join tz-naive with tz-aware ' 'DatetimeIndex') - if not self._tz_compare(other): + if not timezones.tz_compare(self.tz, other.tz): this = self.tz_convert('UTC') other = other.tz_convert('UTC') return this, other @@ -1318,7 +1295,7 @@ def __iter__(self): def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None - if not self._tz_compare(other): + if not timezones.tz_compare(self.tz, other.tz): raise ValueError('Passed item and index have different timezone') return self._simple_new(result, name=name, freq=None, tz=self.tz) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 935def1ffced1..44f3c21d23e62 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -918,15 +918,6 @@ def test_to_datetime_list_of_integers(self): tm.assert_index_equal(rng, result) -<<<<<<< HEAD -======= - def test_to_datetime_freq(self): - xp = bdate_range('2000-1-1', periods=10, tz='UTC') - rs = xp.to_datetime() - assert xp.freq == rs.freq - assert xp._tz_compare(rs) - ->>>>>>> PR_TOOL_MERGE_PR_18596 def test_to_datetime_overflow(self): # gh-17637 # we are overflowing Timedelta range here diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index a600e4cfeb3bc..7e126dd56775b 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2081,7 +2081,7 @@ def test_concat_datetime_timezone(self): idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H') df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1) df2 = pd.DataFrame({'b': [1, 2, 3]}, 
index=idx2) - res = pd.concat([df1, df2], axis=1) + result = pd.concat([df1, df2], axis=1) exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00', '2011-01-01 01:00:00+01:00', @@ -2089,15 +2089,15 @@ def test_concat_datetime_timezone(self): freq='H' ).tz_localize('UTC').tz_convert('Europe/Paris') - exp = pd.DataFrame([[1, 1], [2, 2], [3, 3]], - index=exp_idx, columns=['a', 'b']) + expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]], + index=exp_idx, columns=['a', 'b']) - tm.assert_frame_equal(res, exp) + tm.assert_frame_equal(result, expected) idx3 = pd.date_range('2011-01-01', periods=3, freq='H', tz='Asia/Tokyo') df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3) - res = pd.concat([df1, df3], axis=1) + result = pd.concat([df1, df3], axis=1) exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00', '2010-12-31 16:00:00+00:00', @@ -2107,11 +2107,11 @@ def test_concat_datetime_timezone(self): '2011-01-01 01:00:00+00:00'] ).tz_localize('UTC') - exp = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3], - [1, np.nan], [2, np.nan], [3, np.nan]], - index=exp_idx, columns=['a', 'b']) + expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3], + [1, np.nan], [2, np.nan], [3, np.nan]], + index=exp_idx, columns=['a', 'b']) - tm.assert_frame_equal(res, exp) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) From 473bd87b711f6bfb254dd87278b75eb3d48c887e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 17 Jan 2018 06:43:00 -0500 Subject: [PATCH 3/4] fi --- doc/source/whatsnew/v0.23.0.txt | 2 +- .../indexes/datetimes/test_construction.py | 20 ------------------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c4b56a52ea589..a93e0b1a3b0dd 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -502,7 +502,7 @@ Reshaping - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type 
string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) -- Bug in timezone comparisons, manifesting as a conversion of the index to UTC when ``.reset_index()`` (:issue:`18523`) +- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) - Numeric diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 588d5968bc932..b59dd25ead57f 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -443,26 +443,6 @@ def test_000constructor_resolution(self): assert idx.nanosecond[0] == t1.nanosecond - def test_concat(self): - idx1 = pd.date_range('2011-01-01', periods=3, freq='H', - tz='Europe/Paris') - idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H') - df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1) - df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2) - res = pd.concat([df1, df2], axis=1) - - assert str(res.index.tzinfo) == str(df1.index.tzinfo) - assert str(res.index.tzinfo) == str(df2.index.tzinfo) - - idx3 = pd.date_range('2011-01-01', periods=3, - freq='H', tz='Asia/Tokyo') - df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3) - res = pd.concat([df1, df3], axis=1) - - assert str(res.index.tzinfo) == 'UTC' - assert str(res.index.tzinfo) != str(df1.index.tzinfo) - assert str(res.index.tzinfo) != str(df3.index.tzinfo) - class TestTimeSeries(object): From e95f16fcd0127a4952617e414eb785d7deb5a693 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 17 Jan 2018 19:13:32 -0500 Subject: [PATCH 4/4] fix up doc string --- pandas/_libs/tslibs/timezones.pyx | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index eab451d611dc9..242b8262a8721 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -298,6 +298,10 @@ cpdef bint tz_compare(object start, object end): timezones but aren't evaluted such, but the string representation for both of these is `'Europe/Paris'`. + This exists only to add a notion of equality to pytz-style zones + that is compatible with the notion of equality expected of tzinfo + subclasses. + Parameters ---------- start : tzinfo