pandas-dev · jreback · Jan 18, 2018 · Jan 17, 2018 · Jan 17, 2018 · Jan 17, 2018
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -427,7 +427,6 @@ Conversion
 - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)
 
 
--
 -
 - Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`)
 - Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`)
@@ -503,6 +502,7 @@ Reshaping
 - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string.  The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
 - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
 - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
+- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
 -
 
 Numeric

diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx
@@ -6,7 +6,7 @@ cimport cython
 import cython
 from numpy cimport ndarray
 from tslib import Timestamp
-from tslibs.timezones cimport get_timezone
+from tslibs.timezones cimport tz_compare
 
 from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE,
                              PyObject_RichCompare)
@@ -131,7 +131,7 @@ cdef class Interval(IntervalMixin):
         if not left <= right:
             raise ValueError('left side of interval must be <= right side')
         if (isinstance(left, Timestamp) and
-                get_timezone(left.tzinfo) != get_timezone(right.tzinfo)):
+                not tz_compare(left.tzinfo, right.tzinfo)):
             # GH 18538
             msg = ("left and right must have the same time zone, got "
                    "'{left_tz}' and '{right_tz}'")

diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx
@@ -5,7 +5,7 @@ cimport cython
 from tslibs.nattype import NaT
 from tslibs.conversion cimport convert_to_tsobject
 from tslibs.timedeltas cimport convert_to_timedelta64
-from tslibs.timezones cimport get_timezone
+from tslibs.timezones cimport get_timezone, tz_compare
 from datetime import datetime, timedelta
 iNaT = util.get_nat()
 
@@ -907,7 +907,7 @@ cpdef bint is_datetime_with_singletz_array(ndarray values):
                 val = values[j]
                 if val is not NaT:
                     tz = getattr(val, 'tzinfo', None)
-                    if base_tz != tz and base_tz != get_timezone(tz):
+                    if not tz_compare(base_tz, tz):
                         return False
             break
 

diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
@@ -35,7 +35,7 @@ from timedeltas cimport cast_from_unit
 from timezones cimport (is_utc, is_tzlocal, is_fixed_offset,
                         treat_tz_as_dateutil, treat_tz_as_pytz,
                         get_utcoffset, get_dst_info,
-                        get_timezone, maybe_get_tz)
+                        get_timezone, maybe_get_tz, tz_compare)
 from parsing import parse_datetime_string
 
 from nattype import nat_strings, NaT
@@ -169,7 +169,7 @@ def datetime_to_datetime64(ndarray[object] values):
         elif PyDateTime_Check(val):
             if val.tzinfo is not None:
                 if inferred_tz is not None:
-                    if get_timezone(val.tzinfo) != inferred_tz:
+                    if not tz_compare(val.tzinfo, inferred_tz):
                         raise ValueError('Array must be all same time zone')
                 else:
                     inferred_tz = get_timezone(val.tzinfo)

diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
@@ -33,7 +33,8 @@ from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds,
                           is_leapyear)
 from timedeltas import Timedelta
 from timedeltas cimport delta_to_nanoseconds
-from timezones cimport get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz
+from timezones cimport (
+    get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz, tz_compare)
 
 # ----------------------------------------------------------------------
 # Constants
@@ -266,7 +267,7 @@ cdef class _Timestamp(datetime):
             other = Timestamp(other)
 
             # validate tz's
-            if get_timezone(self.tzinfo) != get_timezone(other.tzinfo):
+            if not tz_compare(self.tzinfo, other.tzinfo):
                 raise TypeError("Timestamp subtraction must have the "
                                 "same timezones or no timezones")
 

diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd
@@ -7,6 +7,7 @@ cdef bint is_tzlocal(object tz)
 cdef bint treat_tz_as_pytz(object tz)
 cdef bint treat_tz_as_dateutil(object tz)
 
+cpdef bint tz_compare(object start, object end)
 cpdef object get_timezone(object tz)
 cpdef object maybe_get_tz(object tz)
 

diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx
@@ -275,7 +275,7 @@ cdef object get_dst_info(object tz):
 def infer_tzinfo(start, end):
     if start is not None and end is not None:
         tz = start.tzinfo
-        if not (get_timezone(tz) == get_timezone(end.tzinfo)):
+        if not tz_compare(tz, end.tzinfo):
             msg = 'Inputs must both have the same timezone, {tz1} != {tz2}'
             raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo))
     elif start is not None:
@@ -285,3 +285,32 @@ def infer_tzinfo(start, end):
     else:
         tz = None
     return tz
+
+
+cpdef bint tz_compare(object start, object end):
+    """
+    Compare string representations of timezones
+
+    The same timezone can be represented as different instances of
+    timezones. For example
+    `<DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>` and
+    `<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>` are essentially same
+    timezones but aren't evaluted such, but the string representation
+    for both of these is `'Europe/Paris'`.
+
+    This exists only to add a notion of equality to pytz-style zones
+    that is compatible with the notion of equality expected of tzinfo
+    subclasses.
+
+    Parameters
+    ----------
+    start : tzinfo
+    end : tzinfo
+
+    Returns:
+    -------
+    compare : bint
+
+    """
+    # GH 18523
+    return get_timezone(start) == get_timezone(end)
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -521,8 +521,7 @@ def _generate(cls, start, end, periods, name, offset,
                 tz = tz.localize(date.replace(tzinfo=None)).tzinfo
 
         if tz is not None and inferred_tz is not None:
-            if not (timezones.get_timezone(inferred_tz) ==
-                    timezones.get_timezone(tz)):
+            if not timezones.tz_compare(inferred_tz, tz):
                 raise AssertionError("Inferred time zone not equal to passed "
                                      "time zone")
 
@@ -1192,7 +1191,7 @@ def _maybe_utc_convert(self, other):
                 raise TypeError('Cannot join tz-naive with tz-aware '
                                 'DatetimeIndex')
 
-            if self.tz != other.tz:
+            if not timezones.tz_compare(self.tz, other.tz):
                 this = self.tz_convert('UTC')
                 other = other.tz_convert('UTC')
         return this, other
@@ -1296,7 +1295,7 @@ def __iter__(self):
 
     def _wrap_union_result(self, other, result):
         name = self.name if self.name == other.name else None
-        if self.tz != other.tz:
+        if not timezones.tz_compare(self.tz, other.tz):
             raise ValueError('Passed item and index have different timezone')
         return self._simple_new(result, name=name, freq=None, tz=self.tz)
 

diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -2074,6 +2074,45 @@ def test_concat_order(self):
             expected = expected.sort_values()
         tm.assert_index_equal(result, expected)
 
+    def test_concat_datetime_timezone(self):
+        # GH 18523
+        idx1 = pd.date_range('2011-01-01', periods=3, freq='H',
+                             tz='Europe/Paris')
+        idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H')
+        df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1)
+        df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2)
+        result = pd.concat([df1, df2], axis=1)
+
+        exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00',
+                                 '2011-01-01 01:00:00+01:00',
+                                 '2011-01-01 02:00:00+01:00'],
+                                freq='H'
+                                ).tz_localize('UTC').tz_convert('Europe/Paris')
+
+        expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]],
+                                index=exp_idx, columns=['a', 'b'])
+
+        tm.assert_frame_equal(result, expected)
+
+        idx3 = pd.date_range('2011-01-01', periods=3,
+                             freq='H', tz='Asia/Tokyo')
+        df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3)
+        result = pd.concat([df1, df3], axis=1)
+
+        exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00',
+                                 '2010-12-31 16:00:00+00:00',
+                                 '2010-12-31 17:00:00+00:00',
+                                 '2010-12-31 23:00:00+00:00',
+                                 '2011-01-01 00:00:00+00:00',
+                                 '2011-01-01 01:00:00+00:00']
+                                ).tz_localize('UTC')
+
+        expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3],
+                                 [1, np.nan], [2, np.nan], [3, np.nan]],
+                                index=exp_idx, columns=['a', 'b'])
+
+        tm.assert_frame_equal(result, expected)
+
 
 @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
 @pytest.mark.parametrize('dt', np.sctypes['float'])