diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index c9e18b585c764..54e35a489d6ba 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -792,6 +792,8 @@ excluded. So there will never be an "NA group" or "NaT group". This was not the versions of pandas, but users were generally discarding the NA group anyway (and supporting it was an implementation headache). +.. _groupby.categorical: + Grouping with ordered factors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -801,9 +803,14 @@ can be used as group keys. If so, the order of the levels will be preserved: .. ipython:: python data = Series(np.random.randn(100)) - factor = qcut(data, [0, .25, .5, .75, 1.]) + data.groupby(factor).mean() + +Further, one can name the bins to produce a custom-labeled binner. + +.. ipython:: python + factor = qcut(data, [0, .25, .5, .75, 1.], labels=['small','medium','large','x-large']) data.groupby(factor).mean() .. _groupby.specify: diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 26aaf9c2be69d..625613f0a5739 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -417,7 +417,8 @@ Tiling The ``cut`` function computes groupings for the values of the input array and is often used to transform continuous variables to discrete or categorical -variables: +variables. These will result in a ``Categorical`` dtype, where the categories +are the bins. .. ipython:: python @@ -433,6 +434,13 @@ Alternatively we can specify custom bin-edges: pd.cut(ages, bins=[0, 18, 35, 70]) +Furthermore, one can specify ``labels`` to have custom labels. + +.. ipython:: python + + pd.cut(ages, bins=[0, 18, 35, 70], labels=['child','adult','senior']) + +``cut/qcut`` are often used as groupers, see the :ref:`grouping with ordered factors <groupby.categorical>` for more. .. 
_reshaping.dummies: diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index b571aab0b19a5..31636068e43f8 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -69,6 +69,7 @@ Bug Fixes - Bung in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`) - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) +- Regression in ``pd.cut`` to ensure passed ``labels`` have a compatible dtype on output (:issue:`10140`) - Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 4a0218bef6001..788cebad4c244 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -4,7 +4,8 @@ import numpy as np from pandas.compat import zip -from pandas import DataFrame, Series, unique +from pandas import (DataFrame, Series, unique, Index, Categorical, CategoricalIndex, + DatetimeIndex, TimedeltaIndex) import pandas.util.testing as tm from pandas.util.testing import assertRaisesRegexp import pandas.core.common as com @@ -97,6 +98,45 @@ def test_label_precision(self): '(0.54, 0.72]'] self.assert_numpy_array_equal(result.categories, ex_levels) + def test_label_coercion(self): + # GH10140 + + df = DataFrame({'x' : 100 * np.random.random(100)}) + df['y'] = df.x**2 + + binedges = np.arange(0,110,10) + binlabels = np.arange(5,105,10) + + # passing in an index + for bl, expected in [(Index(binlabels), np.dtype('int64')), + (DatetimeIndex(['20130101']*len(binlabels))+TimedeltaIndex(binlabels,unit='D'),np.dtype('M8[ns]')), + (TimedeltaIndex(binlabels,unit='D'),np.dtype('m8[ns]')), + (Categorical(binlabels), 'category'), + (Index(Index(binlabels).map(str)), 'category')]: + result = cut(df.x, bins=binedges, labels=bl) + self.assertEqual(result.dtype, expected) + z = df.groupby(result).y.mean() + self.assertEqual(z.index.dtype, expected) 
+ + # passing in a list-like + for bl, expected in [(Index(binlabels), np.dtype('int64')), + (Index(Index(binlabels).map(str)), 'category')]: + bl = np.asarray(bl) + result = cut(df.x, bins=binedges, labels=bl) + self.assertEqual(result.dtype, expected) + z = df.groupby(result).y.mean() + self.assertEqual(z.index.dtype, expected) + + # reversed categories + bl = Categorical(binlabels,categories=binlabels[::-1],ordered=True) + expected = Index(bl).dtype + result = cut(df.x, bins=binedges, labels=bl) + self.assertEqual(result.dtype, expected) + z = df.groupby(result).y.mean() + self.assertEqual(z.index.dtype, expected) + tm.assert_index_equal(z.index, + CategoricalIndex(Categorical.from_codes(np.arange(len(bl)),categories=bl.categories,ordered=True),name='x')) + def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) arr[::3] = np.nan diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 6830919d9c09f..519eaaebf6f3e 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -2,7 +2,7 @@ Quantilization functions and related stuff """ -from pandas.core.api import DataFrame, Series +from pandas.core.api import DataFrame, Series, Index from pandas.core.categorical import Categorical from pandas.core.index import _ensure_index import pandas.core.algorithms as algos @@ -195,6 +195,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, has_nas = na_mask.any() if labels is not False: + + def to_categorical(levels): + if com.is_categorical_dtype(levels): + levels = levels.categories + np.putmask(ids, na_mask, 0) + fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True) + return fac + if labels is None: increases = 0 while True: @@ -209,15 +217,21 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, else: break + fac = to_categorical(levels) + else: if len(labels) != len(bins) - 1: raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') - levels = labels - levels = 
np.asarray(levels, dtype=object) - np.putmask(ids, na_mask, 0) - fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True) + # we want to coerce the resultant Categorical to the binlabels type if supplied + # if we are passed a Categorical in the binlabels, then use this dtype + # 10140 + labels = _ensure_index(labels) + fac = to_categorical(labels) + if not (com.is_object_dtype(labels) or com.is_categorical_dtype(labels)): + fac = type(labels)(np.asarray(fac)) + else: fac = ids - 1 if has_nas: