BUG: Retain timezone dtype with cut and qcut (#19890)

mroeschke · jreback · commit cc1b934a1be6 · 2018-03-09T06:13:50.000-05:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1019,6 +1019,7 @@ Reshaping
 - Bug in :func:`DataFrame.iterrows`, which would infer strings not compliant with `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ as datetimes (:issue:`19671`)
 - Bug in :class:`Series` constructor with ``Categorical`` where a ``ValueError`` is not raised when an index of different length is given (:issue:`19342`)
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
+- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
 
 Other
 ^^^^^
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -1,6 +1,7 @@
 """
 Quantilization functions and related stuff
 """
+from functools import partial
 
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.common import (
@@ -9,6 +10,7 @@
     is_categorical_dtype,
     is_datetime64_dtype,
     is_timedelta64_dtype,
+    is_datetime64tz_dtype,
     _ensure_int64)
 
 import pandas.core.algorithms as algos
@@ -239,7 +241,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None,
     ids = _ensure_int64(bins.searchsorted(x, side=side))
 
     if include_lowest:
-        ids[x == bins[0]] = 1
+        # Numpy 1.9 support: ensure this mask is a Numpy array
+        ids[np.asarray(x == bins[0])] = 1
 
     na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
     has_nas = na_mask.any()
@@ -284,12 +287,14 @@ def _coerce_to_type(x):
     """
     dtype = None
 
-    if is_timedelta64_dtype(x):
-        x = to_timedelta(x)
-        dtype = np.timedelta64
+    if is_datetime64tz_dtype(x):
+        dtype = x.dtype
     elif is_datetime64_dtype(x):
         x = to_datetime(x)
         dtype = np.datetime64
+    elif is_timedelta64_dtype(x):
+        x = to_timedelta(x)
+        dtype = np.timedelta64
 
     if dtype is not None:
         # GH 19768: force NaT to NaN during integer conversion
@@ -305,7 +310,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
 
     Parameters
     ----------
-    bins : list-liek of bins
+    bins : list-like of bins
     dtype : dtype of data
 
     Raises
@@ -318,7 +323,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
             bins = to_timedelta(bins).view(np.int64)
         else:
             raise ValueError("bins must be of timedelta64 dtype")
-    elif is_datetime64_dtype(dtype):
+    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
         if bins_dtype in ['datetime', 'datetime64']:
             bins = to_datetime(bins).view(np.int64)
         else:
@@ -333,7 +338,10 @@ def _format_labels(bins, precision, right=True,
 
     closed = 'right' if right else 'left'
 
-    if is_datetime64_dtype(dtype):
+    if is_datetime64tz_dtype(dtype):
+        formatter = partial(Timestamp, tz=dtype.tz)
+        adjust = lambda x: x - Timedelta('1ns')
+    elif is_datetime64_dtype(dtype):
         formatter = Timestamp
         adjust = lambda x: x - Timedelta('1ns')
     elif is_timedelta64_dtype(dtype):
@@ -372,7 +380,13 @@ def _preprocess_for_cut(x):
         series_index = x.index
         name = x.name
 
-    x = np.asarray(x)
+    # Check that the passed array is a Pandas or Numpy object
+    # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
+    ndim = getattr(x, 'ndim', None)
+    if ndim is None:
+        x = np.asarray(x)
+    if x.ndim != 1:
+        raise ValueError("Input array must be 1 dimensional")
 
     return x_is_series, series_index, name, x
 
diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
@@ -4,7 +4,7 @@
 import numpy as np
 from pandas.compat import zip
 
-from pandas import (Series, isna, to_datetime, DatetimeIndex,
+from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
                     Timestamp, Interval, IntervalIndex, Categorical,
                     cut, qcut, date_range, NaT, TimedeltaIndex)
 from pandas.tseries.offsets import Nano, Day
@@ -104,6 +104,12 @@ def test_cut_corner(self):
 
         pytest.raises(ValueError, cut, [1, 2, 3], 0.5)
 
+    @pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))])
+    @pytest.mark.parametrize('cut_func', [cut, qcut])
+    def test_cut_not_1d_arg(self, arg, cut_func):
+        with pytest.raises(ValueError):
+            cut_func(arg, 2)
+
     def test_cut_out_of_range_more(self):
         # #1511
         s = Series([0, -1, 0, 1, -3], name='x')
@@ -251,18 +257,6 @@ def test_qcut_nas(self):
         result = qcut(arr, 4)
         assert isna(result[:20]).all()
 
-    @pytest.mark.parametrize('s', [
-        Series(DatetimeIndex(['20180101', NaT, '20180103'])),
-        Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
-        ids=lambda x: str(x.dtype))
-    def test_qcut_nat(self, s):
-        # GH 19768
-        intervals = IntervalIndex.from_tuples(
-            [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
-        expected = Series(Categorical(intervals, ordered=True))
-        result = qcut(s, 2)
-        tm.assert_series_equal(result, expected)
-
     def test_qcut_index(self):
         result = qcut([0, 2], 2)
         intervals = [Interval(-0.001, 1), Interval(1, 2)]
@@ -452,6 +446,37 @@ def test_single_bin(self):
         result = cut(s, 1, labels=False)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "array_1_writeable, array_2_writeable",
+        [(True, True), (True, False), (False, False)])
+    def test_cut_read_only(self, array_1_writeable, array_2_writeable):
+        # issue 18773
+        array_1 = np.arange(0, 100, 10)
+        array_1.flags.writeable = array_1_writeable
+
+        array_2 = np.arange(0, 100, 10)
+        array_2.flags.writeable = array_2_writeable
+
+        hundred_elements = np.arange(100)
+
+        tm.assert_categorical_equal(cut(hundred_elements, array_1),
+                                    cut(hundred_elements, array_2))
+
+
+class TestDatelike(object):
+
+    @pytest.mark.parametrize('s', [
+        Series(DatetimeIndex(['20180101', NaT, '20180103'])),
+        Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
+        ids=lambda x: str(x.dtype))
+    def test_qcut_nat(self, s):
+        # GH 19768
+        intervals = IntervalIndex.from_tuples(
+            [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
+        expected = Series(Categorical(intervals, ordered=True))
+        result = qcut(s, 2)
+        tm.assert_series_equal(result, expected)
+
     def test_datetime_cut(self):
         # GH 14714
         # testing for time data to be present as series
@@ -488,6 +513,47 @@ def test_datetime_cut(self):
         result, bins = cut(data, 3, retbins=True)
         tm.assert_series_equal(Series(result), expected)
 
+    @pytest.mark.parametrize('bins', [
+        3, [Timestamp('2013-01-01 04:57:07.200000'),
+            Timestamp('2013-01-01 21:00:00'),
+            Timestamp('2013-01-02 13:00:00'),
+            Timestamp('2013-01-03 05:00:00')]])
+    @pytest.mark.parametrize('box', [list, np.array, Index, Series])
+    def test_datetimetz_cut(self, bins, box):
+        # GH 19872
+        tz = 'US/Eastern'
+        s = Series(date_range('20130101', periods=3, tz=tz))
+        if not isinstance(bins, int):
+            bins = box(bins)
+        result = cut(s, bins)
+        expected = (
+            Series(IntervalIndex([
+                Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz),
+                         Timestamp('2013-01-01 16:00:00', tz=tz)),
+                Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
+                         Timestamp('2013-01-02 08:00:00', tz=tz)),
+                Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
+                         Timestamp('2013-01-03 00:00:00', tz=tz))]))
+            .astype(CDT(ordered=True)))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)])
+    def test_datetimetz_qcut(self, bins):
+        # GH 19872
+        tz = 'US/Eastern'
+        s = Series(date_range('20130101', periods=3, tz=tz))
+        result = qcut(s, bins)
+        expected = (
+            Series(IntervalIndex([
+                Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz),
+                         Timestamp('2013-01-01 16:00:00', tz=tz)),
+                Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
+                         Timestamp('2013-01-02 08:00:00', tz=tz)),
+                Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
+                         Timestamp('2013-01-03 00:00:00', tz=tz))]))
+            .astype(CDT(ordered=True)))
+        tm.assert_series_equal(result, expected)
+
     def test_datetime_bin(self):
         data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
         bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
@@ -523,19 +589,3 @@ def f():
         mask = result.isna()
         tm.assert_numpy_array_equal(
             mask, np.array([False, True, True, True, True]))
-
-    @pytest.mark.parametrize(
-        "array_1_writeable, array_2_writeable",
-        [(True, True), (True, False), (False, False)])
-    def test_cut_read_only(self, array_1_writeable, array_2_writeable):
-        # issue 18773
-        array_1 = np.arange(0, 100, 10)
-        array_1.flags.writeable = array_1_writeable
-
-        array_2 = np.arange(0, 100, 10)
-        array_2.flags.writeable = array_2_writeable
-
-        hundred_elements = np.arange(100)
-
-        tm.assert_categorical_equal(cut(hundred_elements, array_1),
-                                    cut(hundred_elements, array_2))