Skip to content

REGR: ensure passed binlabels to pd.cut have a compat dtype on output (#10140) #10252

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,8 @@ excluded. So there will never be an "NA group" or "NaT group". This was not the
versions of pandas, but users were generally discarding the NA group anyway
(and supporting it was an implementation headache).

.. _groupby.categorical:

Grouping with ordered factors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand All @@ -801,9 +803,14 @@ can be used as group keys. If so, the order of the levels will be preserved:
.. ipython:: python

data = Series(np.random.randn(100))

factor = qcut(data, [0, .25, .5, .75, 1.])
data.groupby(factor).mean()

Further, one can name the bins to produce a custom-labeled binner.

.. ipython:: python

factor = qcut(data, [0, .25, .5, .75, 1.], labels=['small','medium','large','x-large'])
data.groupby(factor).mean()

.. _groupby.specify:
Expand Down
10 changes: 9 additions & 1 deletion doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,8 @@ Tiling

The ``cut`` function computes groupings for the values of the input array and
is often used to transform continuous variables to discrete or categorical
variables:
variables. These will result in a ``Categorical`` dtype, where the categories
are the bins.

.. ipython:: python

Expand All @@ -433,6 +434,13 @@ Alternatively we can specify custom bin-edges:

pd.cut(ages, bins=[0, 18, 35, 70])

Furthermore, one can specify ``labels`` to have custom labels.

.. ipython:: python

pd.cut(ages, bins=[0, 18, 35, 70], labels=['child','adult','senior'])

``cut`` and ``qcut`` are often used as groupers; see :ref:`grouping with ordered factors <groupby.categorical>` for more.

.. _reshaping.dummies:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ Bug Fixes
- Bug in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`)

- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` where names are lost after timedelta arithmetic (:issue:`9926`)
- Regression in ``pd.cut`` to ensure passed ``binlabels`` have a compat dtype on output (:issue:`10140`)


- Bug in ``Series.plot(label="LABEL")`` not correctly setting the label (:issue:`10119`)
Expand Down
42 changes: 41 additions & 1 deletion pandas/tools/tests/test_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import numpy as np
from pandas.compat import zip

from pandas import DataFrame, Series, unique
from pandas import (DataFrame, Series, unique, Index, Categorical, CategoricalIndex,
DatetimeIndex, TimedeltaIndex)
import pandas.util.testing as tm
from pandas.util.testing import assertRaisesRegexp
import pandas.core.common as com
Expand Down Expand Up @@ -97,6 +98,45 @@ def test_label_precision(self):
'(0.54, 0.72]']
self.assert_numpy_array_equal(result.categories, ex_levels)

def test_label_coercion(self):
    """Check the dtype of ``cut`` output for various kinds of ``labels``.

    For each kind of labels passed to ``cut``, the dtype of the result and
    the dtype of the index obtained by grouping on that result are both
    compared against the expected dtype.
    """
    # GH10140

    frame = DataFrame({'x': np.random.random(100) * 100})
    frame['y'] = frame.x ** 2

    edges = np.arange(0, 110, 10)
    centers = np.arange(5, 105, 10)

    def check(labels, dtype):
        # Bin ``x`` with the given labels, verify the dtype of both the
        # binned values and the resulting group index, and hand back the
        # grouped means for any further assertions.
        binned = cut(frame.x, bins=edges, labels=labels)
        self.assertEqual(binned.dtype, dtype)
        means = frame.groupby(binned).y.mean()
        self.assertEqual(means.index.dtype, dtype)
        return means

    # passing in an index
    index_cases = [
        (Index(centers), np.dtype('int64')),
        (DatetimeIndex(['20130101'] * len(centers))
         + TimedeltaIndex(centers, unit='D'),
         np.dtype('M8[ns]')),
        (TimedeltaIndex(centers, unit='D'), np.dtype('m8[ns]')),
        (Categorical(centers), 'category'),
        (Index(Index(centers).map(str)), 'category'),
    ]
    for labels, dtype in index_cases:
        check(labels, dtype)

    # passing in a list-like
    array_cases = [
        (Index(centers), np.dtype('int64')),
        (Index(Index(centers).map(str)), 'category'),
    ]
    for labels, dtype in array_cases:
        check(np.asarray(labels), dtype)

    # reversed categories
    reversed_labels = Categorical(centers,
                                  categories=centers[::-1],
                                  ordered=True)
    means = check(reversed_labels, Index(reversed_labels).dtype)
    expected_index = CategoricalIndex(
        Categorical.from_codes(np.arange(len(reversed_labels)),
                               categories=reversed_labels.categories,
                               ordered=True),
        name='x')
    tm.assert_index_equal(means.index, expected_index)

def test_na_handling(self):
arr = np.arange(0, 0.75, 0.01)
arr[::3] = np.nan
Expand Down
24 changes: 19 additions & 5 deletions pandas/tools/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Quantilization functions and related stuff
"""

from pandas.core.api import DataFrame, Series
from pandas.core.api import DataFrame, Series, Index
from pandas.core.categorical import Categorical
from pandas.core.index import _ensure_index
import pandas.core.algorithms as algos
Expand Down Expand Up @@ -195,6 +195,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
has_nas = na_mask.any()

if labels is not False:

def to_categorical(levels):
    """Build an ordered ``Categorical`` from the bin ids and ``levels``.

    If ``levels`` is itself categorical, its ``categories`` are used.
    Positions flagged in ``na_mask`` are set to 0 in ``ids`` (in place),
    so their code becomes -1 after the ``ids - 1`` shift.
    """
    categories = (levels.categories
                  if com.is_categorical_dtype(levels)
                  else levels)
    np.putmask(ids, na_mask, 0)
    return Categorical(ids - 1, categories, ordered=True, name=name,
                       fastpath=True)

if labels is None:
increases = 0
while True:
Expand All @@ -209,15 +217,21 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
else:
break

fac = to_categorical(levels)

else:
if len(labels) != len(bins) - 1:
raise ValueError('Bin labels must be one fewer than '
'the number of bin edges')
levels = labels

levels = np.asarray(levels, dtype=object)
np.putmask(ids, na_mask, 0)
fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
# we want to coerce the resultant Categorical to the binlabels type if supplied
# if we are passed a Categorical in the binlabels, then use this dtype
# 10140
labels = _ensure_index(labels)
fac = to_categorical(labels)
if not (com.is_object_dtype(labels) or com.is_categorical_dtype(labels)):
fac = type(labels)(np.asarray(fac))

else:
fac = ids - 1
if has_nas:
Expand Down