Skip to content

BUG: Add unordered option to pandas.cut (#33141) #33480

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ Other enhancements
- The :meth:`DataFrame.to_feather` method now supports additional keyword
arguments (e.g. to set the compression) that are added in pyarrow 0.17
(:issue:`33422`).
- :func:`cut` now accepts an ``ordered`` parameter (default ``ordered=True``). If ``ordered=False`` and no labels are provided, a ``ValueError`` is raised (:issue:`33141`)
- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
and :meth:`DataFrame.to_json` now support passing a dict of
compression arguments when using the ``gzip`` and ``bz2`` protocols.
Expand Down Expand Up @@ -720,6 +721,7 @@ Reshaping
- Bug in :meth:`concat` where passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`)
- :meth:`DataFrame.agg` now provides a more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`)
- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`)
- Bug in :func:`cut` where passing non-unique labels raised an error (:issue:`33141`)


Sparse
Expand Down
36 changes: 31 additions & 5 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def cut(
precision: int = 3,
include_lowest: bool = False,
duplicates: str = "raise",
ordered: bool = True,
):
"""
Bin values into discrete intervals.
Expand Down Expand Up @@ -73,7 +74,7 @@ def cut(
the resulting bins. If False, returns only integer indicators of the
bins. This affects the type of the output container (see below).
This argument is ignored when `bins` is an IntervalIndex. If True,
raises an error.
raises an error. When `ordered=False`, labels must be provided.
retbins : bool, default False
Whether to return the bins or not. Useful when bins is provided
as a scalar.
Expand All @@ -85,6 +86,13 @@ def cut(
If bin edges are not unique, raise ValueError or drop non-uniques.

.. versionadded:: 0.23.0
ordered : bool, default True
Whether the labels are ordered or not. Applies to returned types
Categorical and Series (with Categorical dtype). If True,
the resulting categorical will be ordered. If False, the resulting
categorical will be unordered (labels must be provided).

.. versionadded:: 1.1.0

Returns
-------
Expand Down Expand Up @@ -145,6 +153,14 @@ def cut(
[bad, good, medium, medium, good, bad]
Categories (3, object): [bad < medium < good]

``ordered=False`` will result in unordered categories when labels are passed.
This parameter can be used to allow non-unique labels:

>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
... labels=["B", "A", "B"], ordered=False)
[B, B, A, A, B, B]
Categories (2, object): [A, B]

``labels=False`` implies you just want the bins back.

>>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
Expand Down Expand Up @@ -265,6 +281,7 @@ def cut(
include_lowest=include_lowest,
dtype=dtype,
duplicates=duplicates,
ordered=ordered,
)

return _postprocess_for_cut(fac, bins, retbins, dtype, original)
Expand Down Expand Up @@ -362,7 +379,10 @@ def _bins_to_cuts(
include_lowest: bool = False,
dtype=None,
duplicates: str = "raise",
ordered: bool = True,
):
if not ordered and not labels:
raise ValueError("'labels' must be provided if 'ordered = False'")

if duplicates not in ["raise", "drop"]:
raise ValueError(
Expand Down Expand Up @@ -405,16 +425,22 @@ def _bins_to_cuts(
labels = _format_labels(
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
)

elif ordered and len(set(labels)) != len(labels):
raise ValueError(
"labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa
)
else:
if len(labels) != len(bins) - 1:
raise ValueError(
"Bin labels must be one fewer than the number of bin edges"
)

if not is_categorical_dtype(labels):
labels = Categorical(labels, categories=labels, ordered=True)

labels = Categorical(
labels,
categories=labels if len(set(labels)) == len(labels) else None,
ordered=ordered,
)
# TODO: handle mismatch between categorical label order and pandas.cut order.
np.putmask(ids, na_mask, 0)
result = algos.take_nd(labels, ids - 1)

Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,3 +625,42 @@ def test_cut_nullable_integer(bins, right, include_lowest):
)
expected = cut(a, bins, right=right, include_lowest=include_lowest)
tm.assert_categorical_equal(result, expected)


@pytest.mark.parametrize(
    "data, bins, labels, expected_codes, expected_labels",
    [
        ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
        ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
    ],
)
def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
    """
    Check that ``cut`` accepts repeated labels when ``ordered=False``.

    Each parametrized case passes a ``labels`` list containing a duplicate.
    The expected result is an unordered Categorical whose categories are the
    distinct label values (sorted, per the parametrized expectations) and
    whose codes index into those categories.
    """
    # GH 33141
    want = Categorical.from_codes(
        expected_codes,
        categories=expected_labels,
        ordered=False,
    )
    got = cut(data, bins=bins, labels=labels, ordered=False)
    tm.assert_categorical_equal(got, want)


@pytest.mark.parametrize(
    "data, bins, labels, expected_codes, expected_labels",
    [
        ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
        ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
    ],
)
def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
    """
    Check that ``cut`` keeps unique labels in the given order when unordered.

    Each parametrized case passes unique labels that are not in sorted order
    together with ``ordered=False``. The expected result is an unordered
    Categorical whose categories are the labels exactly as supplied.
    """
    # GH 33141
    want = Categorical.from_codes(
        expected_codes,
        categories=expected_labels,
        ordered=False,
    )
    got = cut(data, bins=bins, labels=labels, ordered=False)
    tm.assert_categorical_equal(got, want)


def test_cut_unordered_with_missing_labels_raises_error():
    """
    Check that ``cut`` rejects ``ordered=False`` when no labels are passed.

    The call omits ``labels`` entirely and must raise ``ValueError`` with the
    message asserted below.
    """
    # GH 33141
    expected_message = "'labels' must be provided if 'ordered = False'"
    with pytest.raises(ValueError, match=expected_message):
        cut([0.5, 3], bins=[0, 1, 2], ordered=False)