REGR: ensure passed binlabels to pd.cut have a compat dtype on output (#10140)

jreback · jreback · commit 0b9bb4165c57 · 2015-06-02T20:03:27.000-04:00
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -792,6 +792,8 @@ excluded. So there will never be an "NA group" or "NaT group". This was not the
 versions of pandas, but users were generally discarding the NA group anyway
 (and supporting it was an implementation headache).
 
+.. _groupby.categorical:
+
 Grouping with ordered factors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -801,9 +803,14 @@ can be used as group keys. If so, the order of the levels will be preserved:
 .. ipython:: python
 
    data = Series(np.random.randn(100))
-
    factor = qcut(data, [0, .25, .5, .75, 1.])
+   data.groupby(factor).mean()
+
+Further, one can name the bins to produce a custom-labeled binner.
+
+.. ipython:: python
 
+   factor = qcut(data, [0, .25, .5, .75, 1.], labels=['small','medium','large','x-large'])
    data.groupby(factor).mean()
 
 .. _groupby.specify:
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
@@ -417,7 +417,8 @@ Tiling
 
 The ``cut`` function computes groupings for the values of the input array and
 is often used to transform continuous variables to discrete or categorical
-variables:
+variables. The result has a ``Categorical`` dtype, where the categories
+are the bins.
 
 .. ipython:: python
 
@@ -433,6 +434,13 @@ Alternatively we can specify custom bin-edges:
 
    pd.cut(ages, bins=[0, 18, 35, 70])
 
+Furthermore, one can specify ``labels`` to have custom labels.
+
+.. ipython:: python
+
+   pd.cut(ages, bins=[0, 18, 35, 70], labels=['child','adult','senior'])
+
+``cut`` and ``qcut`` are often used as groupers; see :ref:`grouping with ordered factors <groupby.categorical>` for more.
 
 .. _reshaping.dummies:
 
diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
@@ -69,6 +69,7 @@ Bug Fixes
 - Bung in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`)
 
 - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`)
+- Regression in ``pd.cut`` to ensure passed ``binlabels`` have a compatible dtype on output (:issue:`10140`)
 
 
 - Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`)
diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py
@@ -4,7 +4,8 @@
 import numpy as np
 from pandas.compat import zip
 
-from pandas import DataFrame, Series, unique
+from pandas import (DataFrame, Series, unique, Index, Categorical, CategoricalIndex,
+                    DatetimeIndex, TimedeltaIndex)
 import pandas.util.testing as tm
 from pandas.util.testing import assertRaisesRegexp
 import pandas.core.common as com
@@ -97,6 +98,45 @@ def test_label_precision(self):
                      '(0.54, 0.72]']
         self.assert_numpy_array_equal(result.categories, ex_levels)
 
+    def test_label_coercion(self):
+        # GH10140
+
+        df = DataFrame({'x' : 100 * np.random.random(100)})
+        df['y'] = df.x**2
+
+        binedges = np.arange(0,110,10)
+        binlabels = np.arange(5,105,10)
+
+        # passing in an index
+        for bl, expected in [(Index(binlabels), np.dtype('int64')),
+                             (DatetimeIndex(['20130101']*len(binlabels))+TimedeltaIndex(binlabels,unit='D'),np.dtype('M8[ns]')),
+                             (TimedeltaIndex(binlabels,unit='D'),np.dtype('m8[ns]')),
+                             (Categorical(binlabels), 'category'),
+                             (Index(Index(binlabels).map(str)), 'category')]:
+            result = cut(df.x, bins=binedges, labels=bl)
+            self.assertEqual(result.dtype, expected)
+            z = df.groupby(result).y.mean()
+            self.assertEqual(z.index.dtype, expected)
+
+        # passing in a list-like
+        for bl, expected in [(Index(binlabels), np.dtype('int64')),
+                             (Index(Index(binlabels).map(str)), 'category')]:
+            bl = np.asarray(bl)
+            result = cut(df.x, bins=binedges, labels=bl)
+            self.assertEqual(result.dtype, expected)
+            z = df.groupby(result).y.mean()
+            self.assertEqual(z.index.dtype, expected)
+
+        # reversed categories
+        bl = Categorical(binlabels,categories=binlabels[::-1],ordered=True)
+        expected = Index(bl).dtype
+        result = cut(df.x, bins=binedges, labels=bl)
+        self.assertEqual(result.dtype, expected)
+        z = df.groupby(result).y.mean()
+        self.assertEqual(z.index.dtype, expected)
+        tm.assert_index_equal(z.index,
+                              CategoricalIndex(Categorical.from_codes(np.arange(len(bl)),categories=bl.categories,ordered=True),name='x'))
+
     def test_na_handling(self):
         arr = np.arange(0, 0.75, 0.01)
         arr[::3] = np.nan
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
@@ -2,7 +2,7 @@
 Quantilization functions and related stuff
 """
 
-from pandas.core.api import DataFrame, Series
+from pandas.core.api import DataFrame, Series, Index
 from pandas.core.categorical import Categorical
 from pandas.core.index import _ensure_index
 import pandas.core.algorithms as algos
@@ -195,6 +195,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
     has_nas = na_mask.any()
 
     if labels is not False:
+
+        def to_categorical(levels):
+            if com.is_categorical_dtype(levels):
+                levels = levels.categories
+            np.putmask(ids, na_mask, 0)
+            fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
+            return fac
+
         if labels is None:
             increases = 0
             while True:
@@ -209,15 +217,21 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                 else:
                     break
 
+            fac = to_categorical(levels)
+
         else:
             if len(labels) != len(bins) - 1:
                 raise ValueError('Bin labels must be one fewer than '
                                  'the number of bin edges')
-            levels = labels
 
-        levels = np.asarray(levels, dtype=object)
-        np.putmask(ids, na_mask, 0)
-        fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
+            # coerce the resulting Categorical to the dtype of the supplied binlabels;
+            # object-dtype or Categorical binlabels are left as a Categorical
+            # (GH10140)
+            labels = _ensure_index(labels)
+            fac = to_categorical(labels)
+            if not (com.is_object_dtype(labels) or com.is_categorical_dtype(labels)):
+                fac = type(labels)(np.asarray(fac))
+
     else:
         fac = ids - 1
         if has_nas: