Skip to content

Commit 0b9bb41

Browse files
committed
REGR: ensure passed binlabels to pd.cut have a compat dtype on output (#10140)
1 parent 676cb95 commit 0b9bb41

File tree

5 files changed

+78
-8
lines changed

5 files changed

+78
-8
lines changed

doc/source/groupby.rst

+8-1
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,8 @@ excluded. So there will never be an "NA group" or "NaT group". This was not the
792792
versions of pandas, but users were generally discarding the NA group anyway
793793
(and supporting it was an implementation headache).
794794

795+
.. _groupby.categorical:
796+
795797
Grouping with ordered factors
796798
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
797799

@@ -801,9 +803,14 @@ can be used as group keys. If so, the order of the levels will be preserved:
801803
.. ipython:: python
802804
803805
data = Series(np.random.randn(100))
804-
805806
factor = qcut(data, [0, .25, .5, .75, 1.])
807+
data.groupby(factor).mean()
808+
809+
Further, one can name the bins to produce a custom-labeled binner.
810+
811+
.. ipython:: python
806812
813+
factor = qcut(data, [0, .25, .5, .75, 1.], labels=['small','medium','large','x-large'])
807814
data.groupby(factor).mean()
808815
809816
.. _groupby.specify:

doc/source/reshaping.rst

+9-1
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,8 @@ Tiling
417417

418418
The ``cut`` function computes groupings for the values of the input array and
419419
is often used to transform continuous variables to discrete or categorical
420-
variables:
420+
variables. These will result in a ``Categorical`` dtype, where the categories
421+
are the bins.
421422

422423
.. ipython:: python
423424
@@ -433,6 +434,13 @@ Alternatively we can specify custom bin-edges:
433434
434435
pd.cut(ages, bins=[0, 18, 35, 70])
435436
437+
Furthermore, one can specify ``labels`` to have custom labels.
438+
439+
.. ipython:: python
440+
441+
pd.cut(ages, bins=[0, 18, 35, 70], labels=['child','adult','senior'])
442+
443+
``cut`` and ``qcut`` are often used as groupers; see :ref:`grouping with ordered factors <groupby.categorical>` for more.
436444

437445
.. _reshaping.dummies:
438446

doc/source/whatsnew/v0.16.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ Bug Fixes
6969
- Bug in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`)
7070

7171
- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` where names are lost after timedelta arithmetic (:issue:`9926`)
72+
- Regression in ``pd.cut`` to ensure passed ``binlabels`` have a compat dtype on output (:issue:`10140`)
7273

7374

7475
- Bug in ``Series.plot(label="LABEL")`` not correctly setting the label (:issue:`10119`)

pandas/tools/tests/test_tile.py

+41-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import numpy as np
55
from pandas.compat import zip
66

7-
from pandas import DataFrame, Series, unique
7+
from pandas import (DataFrame, Series, unique, Index, Categorical, CategoricalIndex,
8+
DatetimeIndex, TimedeltaIndex)
89
import pandas.util.testing as tm
910
from pandas.util.testing import assertRaisesRegexp
1011
import pandas.core.common as com
@@ -97,6 +98,45 @@ def test_label_precision(self):
9798
'(0.54, 0.72]']
9899
self.assert_numpy_array_equal(result.categories, ex_levels)
99100

101+
def test_label_coercion(self):
102+
# GH10140
103+
104+
df = DataFrame({'x' : 100 * np.random.random(100)})
105+
df['y'] = df.x**2
106+
107+
binedges = np.arange(0,110,10)
108+
binlabels = np.arange(5,105,10)
109+
110+
# passing in an index
111+
for bl, expected in [(Index(binlabels), np.dtype('int64')),
112+
(DatetimeIndex(['20130101']*len(binlabels))+TimedeltaIndex(binlabels,unit='D'),np.dtype('M8[ns]')),
113+
(TimedeltaIndex(binlabels,unit='D'),np.dtype('m8[ns]')),
114+
(Categorical(binlabels), 'category'),
115+
(Index(Index(binlabels).map(str)), 'category')]:
116+
result = cut(df.x, bins=binedges, labels=bl)
117+
self.assertEqual(result.dtype, expected)
118+
z = df.groupby(result).y.mean()
119+
self.assertEqual(z.index.dtype, expected)
120+
121+
# passing in a list-like
122+
for bl, expected in [(Index(binlabels), np.dtype('int64')),
123+
(Index(Index(binlabels).map(str)), 'category')]:
124+
bl = np.asarray(bl)
125+
result = cut(df.x, bins=binedges, labels=bl)
126+
self.assertEqual(result.dtype, expected)
127+
z = df.groupby(result).y.mean()
128+
self.assertEqual(z.index.dtype, expected)
129+
130+
# reversed categories
131+
bl = Categorical(binlabels,categories=binlabels[::-1],ordered=True)
132+
expected = Index(bl).dtype
133+
result = cut(df.x, bins=binedges, labels=bl)
134+
self.assertEqual(result.dtype, expected)
135+
z = df.groupby(result).y.mean()
136+
self.assertEqual(z.index.dtype, expected)
137+
tm.assert_index_equal(z.index,
138+
CategoricalIndex(Categorical.from_codes(np.arange(len(bl)),categories=bl.categories,ordered=True),name='x'))
139+
100140
def test_na_handling(self):
101141
arr = np.arange(0, 0.75, 0.01)
102142
arr[::3] = np.nan

pandas/tools/tile.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Quantilization functions and related stuff
33
"""
44

5-
from pandas.core.api import DataFrame, Series
5+
from pandas.core.api import DataFrame, Series, Index
66
from pandas.core.categorical import Categorical
77
from pandas.core.index import _ensure_index
88
import pandas.core.algorithms as algos
@@ -195,6 +195,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
195195
has_nas = na_mask.any()
196196

197197
if labels is not False:
198+
199+
def to_categorical(levels):
200+
if com.is_categorical_dtype(levels):
201+
levels = levels.categories
202+
np.putmask(ids, na_mask, 0)
203+
fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
204+
return fac
205+
198206
if labels is None:
199207
increases = 0
200208
while True:
@@ -209,15 +217,21 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
209217
else:
210218
break
211219

220+
fac = to_categorical(levels)
221+
212222
else:
213223
if len(labels) != len(bins) - 1:
214224
raise ValueError('Bin labels must be one fewer than '
215225
'the number of bin edges')
216-
levels = labels
217226

218-
levels = np.asarray(levels, dtype=object)
219-
np.putmask(ids, na_mask, 0)
220-
fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
227+
# we want to coerce the resultant Categorical to the binlabels type if supplied
228+
# if we are passed a Categorical in the binlabels, then use this dtype
229+
# 10140
230+
labels = _ensure_index(labels)
231+
fac = to_categorical(labels)
232+
if not (com.is_object_dtype(labels) or com.is_categorical_dtype(labels)):
233+
fac = type(labels)(np.asarray(fac))
234+
221235
else:
222236
fac = ids - 1
223237
if has_nas:

0 commit comments

Comments
 (0)