Skip to content

BUG: timezone comparisions are inconsistent, manifesting in bugs in .concat #19281

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,6 @@ Conversion
- Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)


-
-
- Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`)
- Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`)
Expand Down Expand Up @@ -503,6 +502,7 @@ Reshaping
- Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
-

Numeric
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/interval.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cimport cython
import cython
from numpy cimport ndarray
from tslib import Timestamp
from tslibs.timezones cimport get_timezone
from tslibs.timezones cimport tz_compare

from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE,
PyObject_RichCompare)
Expand Down Expand Up @@ -131,7 +131,7 @@ cdef class Interval(IntervalMixin):
if not left <= right:
raise ValueError('left side of interval must be <= right side')
if (isinstance(left, Timestamp) and
get_timezone(left.tzinfo) != get_timezone(right.tzinfo)):
not tz_compare(left.tzinfo, right.tzinfo)):
# GH 18538
msg = ("left and right must have the same time zone, got "
"'{left_tz}' and '{right_tz}'")
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ cimport cython
from tslibs.nattype import NaT
from tslibs.conversion cimport convert_to_tsobject
from tslibs.timedeltas cimport convert_to_timedelta64
from tslibs.timezones cimport get_timezone
from tslibs.timezones cimport get_timezone, tz_compare
from datetime import datetime, timedelta
iNaT = util.get_nat()

Expand Down Expand Up @@ -907,7 +907,7 @@ cpdef bint is_datetime_with_singletz_array(ndarray values):
val = values[j]
if val is not NaT:
tz = getattr(val, 'tzinfo', None)
if base_tz != tz and base_tz != get_timezone(tz):
if not tz_compare(base_tz, tz):
return False
break

Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ from timedeltas cimport cast_from_unit
from timezones cimport (is_utc, is_tzlocal, is_fixed_offset,
treat_tz_as_dateutil, treat_tz_as_pytz,
get_utcoffset, get_dst_info,
get_timezone, maybe_get_tz)
get_timezone, maybe_get_tz, tz_compare)
from parsing import parse_datetime_string

from nattype import nat_strings, NaT
Expand Down Expand Up @@ -169,7 +169,7 @@ def datetime_to_datetime64(ndarray[object] values):
elif PyDateTime_Check(val):
if val.tzinfo is not None:
if inferred_tz is not None:
if get_timezone(val.tzinfo) != inferred_tz:
if not tz_compare(val.tzinfo, inferred_tz):
raise ValueError('Array must be all same time zone')
else:
inferred_tz = get_timezone(val.tzinfo)
Expand Down
5 changes: 3 additions & 2 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds,
is_leapyear)
from timedeltas import Timedelta
from timedeltas cimport delta_to_nanoseconds
from timezones cimport get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz
from timezones cimport (
get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz, tz_compare)

# ----------------------------------------------------------------------
# Constants
Expand Down Expand Up @@ -266,7 +267,7 @@ cdef class _Timestamp(datetime):
other = Timestamp(other)

# validate tz's
if get_timezone(self.tzinfo) != get_timezone(other.tzinfo):
if not tz_compare(self.tzinfo, other.tzinfo):
raise TypeError("Timestamp subtraction must have the "
"same timezones or no timezones")

Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/timezones.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ cdef bint is_tzlocal(object tz)
cdef bint treat_tz_as_pytz(object tz)
cdef bint treat_tz_as_dateutil(object tz)

cpdef bint tz_compare(object start, object end)
cpdef object get_timezone(object tz)
cpdef object maybe_get_tz(object tz)

Expand Down
31 changes: 30 additions & 1 deletion pandas/_libs/tslibs/timezones.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ cdef object get_dst_info(object tz):
def infer_tzinfo(start, end):
if start is not None and end is not None:
tz = start.tzinfo
if not (get_timezone(tz) == get_timezone(end.tzinfo)):
if not tz_compare(tz, end.tzinfo):
msg = 'Inputs must both have the same timezone, {tz1} != {tz2}'
raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo))
elif start is not None:
Expand All @@ -285,3 +285,32 @@ def infer_tzinfo(start, end):
else:
tz = None
return tz


cpdef bint tz_compare(object start, object end):
"""
Compare string representations of timezones

The same timezone can be represented as different instances of
timezones. For example
`<DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>` and
`<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>` are essentially same
timezones but aren't evaluted such, but the string representation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you say "aren't evaluated as such" are you talking about ==? i.e. start == end is False, but tz_compare(start, end) is True?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TomAugspurger Yes, the reason for this is that pytz generates a separate tzinfo object for each unique offset/tzname combination, and that is what is attached when you call .localize or .normalize. These time zones don't compare equal to one another.

It might be worth noting that this function exists only to add a notion of equality to pytz-style zones that is compatible with the notion of equality expected of tzinfo subclasses.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another thing that might be worth noting is that, as currently written (and probably it would be out of scope to fix this), this will not disambiguate zones that are actually links to other zones.

See the backward file, US/Eastern is a link to America/New_York, for example, but that information is not accessible either from pytz or from the zoneinfo files, as far as I know. Even though pytz.timezone(US/Eastern) and pytz.timezone(America/New_York) always generate the same exact rules, they will be treated as different zones for purposes of tz_compare (which also means concat will merge them and convert to UTC).

for both of these is `'Europe/Paris'`.

This exists only to add a notion of equality to pytz-style zones
that is compatible with the notion of equality expected of tzinfo
subclasses.

Parameters
----------
start : tzinfo
end : tzinfo

Returns:
-------
compare : bint

"""
# GH 18523
return get_timezone(start) == get_timezone(end)
7 changes: 3 additions & 4 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,7 @@ def _generate(cls, start, end, periods, name, offset,
tz = tz.localize(date.replace(tzinfo=None)).tzinfo

if tz is not None and inferred_tz is not None:
if not (timezones.get_timezone(inferred_tz) ==
timezones.get_timezone(tz)):
if not timezones.tz_compare(inferred_tz, tz):
raise AssertionError("Inferred time zone not equal to passed "
"time zone")

Expand Down Expand Up @@ -1192,7 +1191,7 @@ def _maybe_utc_convert(self, other):
raise TypeError('Cannot join tz-naive with tz-aware '
'DatetimeIndex')

if self.tz != other.tz:
if not timezones.tz_compare(self.tz, other.tz):
this = self.tz_convert('UTC')
other = other.tz_convert('UTC')
return this, other
Expand Down Expand Up @@ -1296,7 +1295,7 @@ def __iter__(self):

def _wrap_union_result(self, other, result):
name = self.name if self.name == other.name else None
if self.tz != other.tz:
if not timezones.tz_compare(self.tz, other.tz):
raise ValueError('Passed item and index have different timezone')
return self._simple_new(result, name=name, freq=None, tz=self.tz)

Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/reshape/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2074,6 +2074,45 @@ def test_concat_order(self):
expected = expected.sort_values()
tm.assert_index_equal(result, expected)

def test_concat_datetime_timezone(self):
# GH 18523
idx1 = pd.date_range('2011-01-01', periods=3, freq='H',
tz='Europe/Paris')
idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H')
df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1)
df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2)
result = pd.concat([df1, df2], axis=1)

exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00',
'2011-01-01 01:00:00+01:00',
'2011-01-01 02:00:00+01:00'],
freq='H'
).tz_localize('UTC').tz_convert('Europe/Paris')

expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]],
index=exp_idx, columns=['a', 'b'])

tm.assert_frame_equal(result, expected)

idx3 = pd.date_range('2011-01-01', periods=3,
freq='H', tz='Asia/Tokyo')
df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3)
result = pd.concat([df1, df3], axis=1)

exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00',
'2010-12-31 16:00:00+00:00',
'2010-12-31 17:00:00+00:00',
'2010-12-31 23:00:00+00:00',
'2011-01-01 00:00:00+00:00',
'2011-01-01 01:00:00+00:00']
).tz_localize('UTC')

expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3],
[1, np.nan], [2, np.nan], [3, np.nan]],
index=exp_idx, columns=['a', 'b'])

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
@pytest.mark.parametrize('dt', np.sctypes['float'])
Expand Down