Skip to content

Commit 88d0238

Browse files
authored
BUG: timezone comparisons are inconsistent, manifesting in bugs in .concat (#19281)
1 parent 65d1b62 commit 88d0238

File tree

9 files changed

+83
-14
lines changed

9 files changed

+83
-14
lines changed

doc/source/whatsnew/v0.23.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,6 @@ Conversion
427427
- Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)
428428

429429

430-
-
431430
-
432431
- Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`)
433432
- Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`)
@@ -503,6 +502,7 @@ Reshaping
503502
- Bug in :func:`DataFrame.pivot_table` which failed when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
504503
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
505504
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
505+
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
506506
-
507507

508508
Numeric

pandas/_libs/interval.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cimport cython
66
import cython
77
from numpy cimport ndarray
88
from tslib import Timestamp
9-
from tslibs.timezones cimport get_timezone
9+
from tslibs.timezones cimport tz_compare
1010

1111
from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE,
1212
PyObject_RichCompare)
@@ -131,7 +131,7 @@ cdef class Interval(IntervalMixin):
131131
if not left <= right:
132132
raise ValueError('left side of interval must be <= right side')
133133
if (isinstance(left, Timestamp) and
134-
get_timezone(left.tzinfo) != get_timezone(right.tzinfo)):
134+
not tz_compare(left.tzinfo, right.tzinfo)):
135135
# GH 18538
136136
msg = ("left and right must have the same time zone, got "
137137
"'{left_tz}' and '{right_tz}'")

pandas/_libs/src/inference.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ cimport cython
55
from tslibs.nattype import NaT
66
from tslibs.conversion cimport convert_to_tsobject
77
from tslibs.timedeltas cimport convert_to_timedelta64
8-
from tslibs.timezones cimport get_timezone
8+
from tslibs.timezones cimport get_timezone, tz_compare
99
from datetime import datetime, timedelta
1010
iNaT = util.get_nat()
1111

@@ -907,7 +907,7 @@ cpdef bint is_datetime_with_singletz_array(ndarray values):
907907
val = values[j]
908908
if val is not NaT:
909909
tz = getattr(val, 'tzinfo', None)
910-
if base_tz != tz and base_tz != get_timezone(tz):
910+
if not tz_compare(base_tz, tz):
911911
return False
912912
break
913913

pandas/_libs/tslibs/conversion.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ from timedeltas cimport cast_from_unit
3535
from timezones cimport (is_utc, is_tzlocal, is_fixed_offset,
3636
treat_tz_as_dateutil, treat_tz_as_pytz,
3737
get_utcoffset, get_dst_info,
38-
get_timezone, maybe_get_tz)
38+
get_timezone, maybe_get_tz, tz_compare)
3939
from parsing import parse_datetime_string
4040

4141
from nattype import nat_strings, NaT
@@ -169,7 +169,7 @@ def datetime_to_datetime64(ndarray[object] values):
169169
elif PyDateTime_Check(val):
170170
if val.tzinfo is not None:
171171
if inferred_tz is not None:
172-
if get_timezone(val.tzinfo) != inferred_tz:
172+
if not tz_compare(val.tzinfo, inferred_tz):
173173
raise ValueError('Array must be all same time zone')
174174
else:
175175
inferred_tz = get_timezone(val.tzinfo)

pandas/_libs/tslibs/timestamps.pyx

+3-2
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds,
3333
is_leapyear)
3434
from timedeltas import Timedelta
3535
from timedeltas cimport delta_to_nanoseconds
36-
from timezones cimport get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz
36+
from timezones cimport (
37+
get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz, tz_compare)
3738

3839
# ----------------------------------------------------------------------
3940
# Constants
@@ -266,7 +267,7 @@ cdef class _Timestamp(datetime):
266267
other = Timestamp(other)
267268

268269
# validate tz's
269-
if get_timezone(self.tzinfo) != get_timezone(other.tzinfo):
270+
if not tz_compare(self.tzinfo, other.tzinfo):
270271
raise TypeError("Timestamp subtraction must have the "
271272
"same timezones or no timezones")
272273

pandas/_libs/tslibs/timezones.pxd

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ cdef bint is_tzlocal(object tz)
77
cdef bint treat_tz_as_pytz(object tz)
88
cdef bint treat_tz_as_dateutil(object tz)
99

10+
cpdef bint tz_compare(object start, object end)
1011
cpdef object get_timezone(object tz)
1112
cpdef object maybe_get_tz(object tz)
1213

pandas/_libs/tslibs/timezones.pyx

+30-1
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ cdef object get_dst_info(object tz):
275275
def infer_tzinfo(start, end):
276276
if start is not None and end is not None:
277277
tz = start.tzinfo
278-
if not (get_timezone(tz) == get_timezone(end.tzinfo)):
278+
if not tz_compare(tz, end.tzinfo):
279279
msg = 'Inputs must both have the same timezone, {tz1} != {tz2}'
280280
raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo))
281281
elif start is not None:
@@ -285,3 +285,32 @@ def infer_tzinfo(start, end):
285285
else:
286286
tz = None
287287
return tz
288+
289+
290+
cpdef bint tz_compare(object start, object end):
291+
"""
292+
Compare string representations of timezones
293+
294+
The same timezone can be represented as different instances of
295+
timezones. For example
296+
`<DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>` and
297+
`<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>` are essentially the same
298+
timezones but aren't evaluated as such, but the string representation
299+
for both of these is `'Europe/Paris'`.
300+
301+
This exists only to add a notion of equality to pytz-style zones
302+
that is compatible with the notion of equality expected of tzinfo
303+
subclasses.
304+
305+
Parameters
306+
----------
307+
start : tzinfo
308+
end : tzinfo
309+
310+
Returns
311+
-------
312+
compare : bint
313+
314+
"""
315+
# GH 18523
316+
return get_timezone(start) == get_timezone(end)

pandas/core/indexes/datetimes.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -521,8 +521,7 @@ def _generate(cls, start, end, periods, name, offset,
521521
tz = tz.localize(date.replace(tzinfo=None)).tzinfo
522522

523523
if tz is not None and inferred_tz is not None:
524-
if not (timezones.get_timezone(inferred_tz) ==
525-
timezones.get_timezone(tz)):
524+
if not timezones.tz_compare(inferred_tz, tz):
526525
raise AssertionError("Inferred time zone not equal to passed "
527526
"time zone")
528527

@@ -1192,7 +1191,7 @@ def _maybe_utc_convert(self, other):
11921191
raise TypeError('Cannot join tz-naive with tz-aware '
11931192
'DatetimeIndex')
11941193

1195-
if self.tz != other.tz:
1194+
if not timezones.tz_compare(self.tz, other.tz):
11961195
this = self.tz_convert('UTC')
11971196
other = other.tz_convert('UTC')
11981197
return this, other
@@ -1296,7 +1295,7 @@ def __iter__(self):
12961295

12971296
def _wrap_union_result(self, other, result):
12981297
name = self.name if self.name == other.name else None
1299-
if self.tz != other.tz:
1298+
if not timezones.tz_compare(self.tz, other.tz):
13001299
raise ValueError('Passed item and index have different timezone')
13011300
return self._simple_new(result, name=name, freq=None, tz=self.tz)
13021301

pandas/tests/reshape/test_concat.py

+39
Original file line numberDiff line numberDiff line change
@@ -2074,6 +2074,45 @@ def test_concat_order(self):
20742074
expected = expected.sort_values()
20752075
tm.assert_index_equal(result, expected)
20762076

2077+
def test_concat_datetime_timezone(self):
2078+
# GH 18523
2079+
idx1 = pd.date_range('2011-01-01', periods=3, freq='H',
2080+
tz='Europe/Paris')
2081+
idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H')
2082+
df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1)
2083+
df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2)
2084+
result = pd.concat([df1, df2], axis=1)
2085+
2086+
exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00',
2087+
'2011-01-01 01:00:00+01:00',
2088+
'2011-01-01 02:00:00+01:00'],
2089+
freq='H'
2090+
).tz_localize('UTC').tz_convert('Europe/Paris')
2091+
2092+
expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]],
2093+
index=exp_idx, columns=['a', 'b'])
2094+
2095+
tm.assert_frame_equal(result, expected)
2096+
2097+
idx3 = pd.date_range('2011-01-01', periods=3,
2098+
freq='H', tz='Asia/Tokyo')
2099+
df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3)
2100+
result = pd.concat([df1, df3], axis=1)
2101+
2102+
exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00',
2103+
'2010-12-31 16:00:00+00:00',
2104+
'2010-12-31 17:00:00+00:00',
2105+
'2010-12-31 23:00:00+00:00',
2106+
'2011-01-01 00:00:00+00:00',
2107+
'2011-01-01 01:00:00+00:00']
2108+
).tz_localize('UTC')
2109+
2110+
expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3],
2111+
[1, np.nan], [2, np.nan], [3, np.nan]],
2112+
index=exp_idx, columns=['a', 'b'])
2113+
2114+
tm.assert_frame_equal(result, expected)
2115+
20772116

20782117
@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
20792118
@pytest.mark.parametrize('dt', np.sctypes['float'])

0 commit comments

Comments
 (0)