Skip to content

Commit d04b965

Browse files
authored
BUG: Add unordered option to pandas.cut (#33141) (#33480)
1 parent 49daf66 commit d04b965

File tree

3 files changed

+72
-5
lines changed

3 files changed

+72
-5
lines changed

doc/source/whatsnew/v1.1.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ Other enhancements
139139
- The :meth:`DataFrame.to_feather` method now supports additional keyword
140140
arguments (e.g. to set the compression) that are added in pyarrow 0.17
141141
(:issue:`33422`).
142+
- :func:`cut` now accepts the parameter ``ordered`` (default ``ordered=True``). If ``ordered=False`` and no labels are provided, an error is raised (:issue:`33141`)
142143
- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
143144
and :meth:`DataFrame.to_json` now support passing a dict of
144145
compression arguments when using the ``gzip`` and ``bz2`` protocols.
@@ -722,6 +723,7 @@ Reshaping
722723
- Bug in :meth:`concat` where passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`)
723724
- :meth:`DataFrame.agg` now provides a more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`)
724725
- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`)
726+
- Bug in :func:`cut` where an error was raised when non-unique labels were passed (:issue:`33141`)
725727

726728

727729
Sparse

pandas/core/reshape/tile.py

+31-5
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def cut(
3838
precision: int = 3,
3939
include_lowest: bool = False,
4040
duplicates: str = "raise",
41+
ordered: bool = True,
4142
):
4243
"""
4344
Bin values into discrete intervals.
@@ -73,7 +74,7 @@ def cut(
7374
the resulting bins. If False, returns only integer indicators of the
7475
bins. This affects the type of the output container (see below).
7576
This argument is ignored when `bins` is an IntervalIndex. If True,
76-
raises an error.
77+
raises an error. When `ordered=False`, labels must be provided.
7778
retbins : bool, default False
7879
Whether to return the bins or not. Useful when bins is provided
7980
as a scalar.
@@ -85,6 +86,13 @@ def cut(
8586
If bin edges are not unique, raise ValueError or drop non-uniques.
8687
8788
.. versionadded:: 0.23.0
89+
ordered : bool, default True
90+
Whether the labels are ordered or not. Applies to returned types
91+
Categorical and Series (with Categorical dtype). If True,
92+
the resulting categorical will be ordered. If False, the resulting
93+
categorical will be unordered (labels must be provided).
94+
95+
.. versionadded:: 1.1.0
8896
8997
Returns
9098
-------
@@ -145,6 +153,14 @@ def cut(
145153
[bad, good, medium, medium, good, bad]
146154
Categories (3, object): [bad < medium < good]
147155
156+
``ordered=False`` will result in unordered categories when labels are passed.
157+
This parameter can be used to allow non-unique labels:
158+
159+
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
160+
... labels=["B", "A", "B"], ordered=False)
161+
[B, B, A, A, B, B]
162+
Categories (2, object): [A, B]
163+
148164
``labels=False`` implies you just want the bins back.
149165
150166
>>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
@@ -265,6 +281,7 @@ def cut(
265281
include_lowest=include_lowest,
266282
dtype=dtype,
267283
duplicates=duplicates,
284+
ordered=ordered,
268285
)
269286

270287
return _postprocess_for_cut(fac, bins, retbins, dtype, original)
@@ -362,7 +379,10 @@ def _bins_to_cuts(
362379
include_lowest: bool = False,
363380
dtype=None,
364381
duplicates: str = "raise",
382+
ordered: bool = True,
365383
):
384+
if not ordered and not labels:
385+
raise ValueError("'labels' must be provided if 'ordered = False'")
366386

367387
if duplicates not in ["raise", "drop"]:
368388
raise ValueError(
@@ -405,16 +425,22 @@ def _bins_to_cuts(
405425
labels = _format_labels(
406426
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
407427
)
408-
428+
elif ordered and len(set(labels)) != len(labels):
429+
raise ValueError(
430+
"labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa
431+
)
409432
else:
410433
if len(labels) != len(bins) - 1:
411434
raise ValueError(
412435
"Bin labels must be one fewer than the number of bin edges"
413436
)
414-
415437
if not is_categorical_dtype(labels):
416-
labels = Categorical(labels, categories=labels, ordered=True)
417-
438+
labels = Categorical(
439+
labels,
440+
categories=labels if len(set(labels)) == len(labels) else None,
441+
ordered=ordered,
442+
)
443+
# TODO: handle mismatch between categorical label order and pandas.cut order.
418444
np.putmask(ids, na_mask, 0)
419445
result = algos.take_nd(labels, ids - 1)
420446

pandas/tests/reshape/test_cut.py

+39
Original file line numberDiff line numberDiff line change
@@ -625,3 +625,42 @@ def test_cut_nullable_integer(bins, right, include_lowest):
625625
)
626626
expected = cut(a, bins, right=right, include_lowest=include_lowest)
627627
tm.assert_categorical_equal(result, expected)
628+
629+
630+
@pytest.mark.parametrize(
631+
"data, bins, labels, expected_codes, expected_labels",
632+
[
633+
([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
634+
([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
635+
],
636+
)
637+
def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
638+
# GH 33141
639+
result = cut(data, bins=bins, labels=labels, ordered=False)
640+
expected = Categorical.from_codes(
641+
expected_codes, categories=expected_labels, ordered=False
642+
)
643+
tm.assert_categorical_equal(result, expected)
644+
645+
646+
@pytest.mark.parametrize(
647+
"data, bins, labels, expected_codes, expected_labels",
648+
[
649+
([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
650+
([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
651+
],
652+
)
653+
def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
654+
# GH 33141
655+
result = cut(data, bins=bins, labels=labels, ordered=False)
656+
expected = Categorical.from_codes(
657+
expected_codes, categories=expected_labels, ordered=False
658+
)
659+
tm.assert_categorical_equal(result, expected)
660+
661+
662+
def test_cut_unordered_with_missing_labels_raises_error():
663+
# GH 33141
664+
msg = "'labels' must be provided if 'ordered = False'"
665+
with pytest.raises(ValueError, match=msg):
666+
cut([0.5, 3], bins=[0, 1, 2], ordered=False)

0 commit comments

Comments
 (0)