REGR: ensure passed binlabels to pd.cut have a compat dtype on output (#10140)

jreback · jreback · commit d12db3a79edb · 2015-06-02T18:35:52.000-04:00
diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
@@ -69,6 +69,7 @@ Bug Fixes
 - Bung in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`)
 
 - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`)
+- Regression in ``pd.cut`` to ensure passed ``binlabels`` have a compatible dtype on output (:issue:`10140`)
 
 
 - Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`)
diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py
@@ -4,7 +4,7 @@
 import numpy as np
 from pandas.compat import zip
 
-from pandas import DataFrame, Series, unique
+from pandas import DataFrame, Series, unique, Index, Categorical, CategoricalIndex
 import pandas.util.testing as tm
 from pandas.util.testing import assertRaisesRegexp
 import pandas.core.common as com
@@ -97,6 +97,32 @@ def test_label_precision(self):
                      '(0.54, 0.72]']
         self.assert_numpy_array_equal(result.categories, ex_levels)
 
+    def test_label_coercion(self):
+        # GH10140
+
+        df = DataFrame({'x' : 100 * np.random.random(100)})
+        df['y'] = df.x**2
+
+        binedges = np.arange(0,110,10)
+        binlabels = np.arange(5,105,10)
+
+        for bl in [Index(binlabels), Categorical(binlabels), Index(binlabels).map(str)]:
+            expected = Index(bl).dtype
+            result = cut(df.x, bins=binedges, labels=bl)
+            self.assertEqual(result.dtype, expected)
+            z = df.groupby(result).y.mean()
+            self.assertEqual(z.index.dtype, expected)
+
+        # reversed categories
+        bl = Categorical(binlabels,categories=binlabels[::-1],ordered=True)
+        expected = Index(bl).dtype
+        result = cut(df.x, bins=binedges, labels=bl)
+        self.assertEqual(result.dtype, expected)
+        z = df.groupby(result).y.mean()
+        self.assertEqual(z.index.dtype, expected)
+        tm.assert_index_equal(z.index,
+                              CategoricalIndex(Categorical.from_codes(np.arange(len(bl)),categories=bl.categories,ordered=True),name='x'))
+
     def test_na_handling(self):
         arr = np.arange(0, 0.75, 0.01)
         arr[::3] = np.nan
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
@@ -195,6 +195,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
     has_nas = na_mask.any()
 
     if labels is not False:
+
+        def to_categorical(levels):
+            if com.is_categorical_dtype(levels):
+                levels = levels.categories
+            np.putmask(ids, na_mask, 0)
+            fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
+            return fac
+
         if labels is None:
             increases = 0
             while True:
@@ -209,15 +217,23 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                 else:
                     break
 
+
+            fac = to_categorical(levels)
+
+
         else:
             if len(labels) != len(bins) - 1:
                 raise ValueError('Bin labels must be one fewer than '
                                  'the number of bin edges')
-            levels = labels
 
-        levels = np.asarray(levels, dtype=object)
-        np.putmask(ids, na_mask, 0)
-        fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
+            # we want to coerce the resultant Categorical to the binlabels type if supplied
+            # if we are passed a Categorical in the binlabels, then use this dtype
+            # 10140
+            labels = _ensure_index(labels)
+            fac = to_categorical(labels)
+            if not com.is_categorical_dtype(labels):
+                fac = type(labels)(np.asarray(fac))
+
     else:
         fac = ids - 1
         if has_nas: