Skip to content

Commit b9cb99e

Browse files
committed
BUG: Add unordered option to pandas.cut. Added tests. Issue: Pandas cut raises error if labels are non-unique (#33141)
1 parent a00202d commit b9cb99e

File tree

3 files changed

+53
-1
lines changed

3 files changed

+53
-1
lines changed

doc/source/whatsnew/v1.1.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,8 @@ Reshaping
540540
- Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`)
541541
- :meth:`DataFrame.agg` now provides a more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`)
542542
- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`)
543+
- Bug in :func:`cut` where passing non-unique labels raised an error. Added the input parameter ``ordered`` to :func:`cut` (default ``ordered=True``).
544+
If ``ordered=False`` and no labels are provided, an error will be raised (:issue:`33141`)
543545

544546

545547
Sparse

pandas/core/reshape/tile.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def cut(
3838
precision: int = 3,
3939
include_lowest: bool = False,
4040
duplicates: str = "raise",
41+
ordered: bool = True,
4142
):
4243
"""
4344
Bin values into discrete intervals.
@@ -265,6 +266,7 @@ def cut(
265266
include_lowest=include_lowest,
266267
dtype=dtype,
267268
duplicates=duplicates,
269+
ordered=ordered,
268270
)
269271

270272
return _postprocess_for_cut(fac, bins, retbins, dtype, original)
@@ -362,7 +364,10 @@ def _bins_to_cuts(
362364
include_lowest: bool = False,
363365
dtype=None,
364366
duplicates: str = "raise",
367+
ordered: bool = True,
365368
):
369+
if not ordered and not labels:
370+
raise ValueError("'labels' must be provided if 'ordered = False'")
366371

367372
if duplicates not in ["raise", "drop"]:
368373
raise ValueError(
@@ -413,7 +418,10 @@ def _bins_to_cuts(
413418
)
414419

415420
if not is_categorical_dtype(labels):
416-
labels = Categorical(labels, categories=labels, ordered=True)
421+
if len(set(labels)) == len(labels):
422+
labels = Categorical(labels, categories=labels, ordered=ordered)
423+
else:
424+
labels = Categorical(labels, ordered=ordered)
417425

418426
np.putmask(ids, na_mask, 0)
419427
result = algos.take_nd(labels, ids - 1)

pandas/tests/reshape/test_cut.py

+42
Original file line numberDiff line numberDiff line change
@@ -625,3 +625,45 @@ def test_cut_nullable_integer(bins, right, include_lowest):
625625
)
626626
expected = cut(a, bins, right=right, include_lowest=include_lowest)
627627
tm.assert_categorical_equal(result, expected)
628+
629+
630+
@pytest.mark.parametrize(
    "data, bins, labels",
    [
        ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"]),
        ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2]),
    ],
)
def test_cut_non_unique_labels(data, bins, labels):
    # GH 33141: labels containing duplicates, given as a plain list with
    # ordered=False, should produce the same result as passing the
    # equivalent unordered Categorical.
    categorical_labels = Categorical(labels, ordered=False)
    expected = cut(data, bins=bins, labels=categorical_labels, ordered=False)

    result = cut(data, bins=bins, labels=labels, ordered=False)

    tm.assert_categorical_equal(result, expected)
643+
644+
645+
@pytest.mark.parametrize(
    "data, bins, labels",
    [
        ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"]),
        ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2]),
    ],
)
def test_cut_unordered_labels(data, bins, labels):
    # GH 33141: unique labels supplied in a non-sorted order with
    # ordered=False should match the result of passing an unordered
    # Categorical whose categories are the labels in the given order.
    result = cut(data, bins=bins, labels=labels, ordered=False)
    expected = cut(
        data,
        bins=bins,
        labels=Categorical(labels, categories=labels, ordered=False),
        ordered=False,
    )
    tm.assert_categorical_equal(result, expected)
663+
664+
665+
@pytest.mark.parametrize("data, bins,", [([0.5, 3], [0, 1, 2])])
def test_cut_unordered_with_missing_labels_raises_error(data, bins):
    # Requesting unordered output without supplying labels must be
    # rejected with a ValueError carrying the expected message.
    with pytest.raises(
        ValueError, match="'labels' must be provided if 'ordered = False'"
    ):
        cut(data, bins=bins, ordered=False)

0 commit comments

Comments
 (0)