pandas-dev · jreback · May 1, 2020 · Apr 11, 2020 · Apr 11, 2020 · Apr 14, 2020
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -139,6 +139,7 @@ Other enhancements
 - The :meth:`DataFrame.to_feather` method now supports additional keyword
   arguments (e.g. to set the compression) that are added in pyarrow 0.17
   (:issue:`33422`).
+- :func:`cut` now accepts the parameter ``ordered`` (default ``ordered=True``). If ``ordered=False`` and no labels are provided, an error is raised (:issue:`33141`)
 - :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
   and :meth:`DataFrame.to_json` now support passing a dict of
   compression arguments when using the ``gzip`` and ``bz2`` protocols.
@@ -720,6 +721,7 @@ Reshaping
 - Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`)
 - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`)
 - Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`)
+- Bug in :func:`cut` where an error was raised when non-unique labels were passed (:issue:`33141`)
 
 
 Sparse

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -38,6 +38,7 @@ def cut(
     precision: int = 3,
     include_lowest: bool = False,
     duplicates: str = "raise",
+    ordered: bool = True,
 ):
     """
     Bin values into discrete intervals.
@@ -73,7 +74,7 @@ def cut(
         the resulting bins. If False, returns only integer indicators of the
         bins. This affects the type of the output container (see below).
         This argument is ignored when `bins` is an IntervalIndex. If True,
-        raises an error.
+        raises an error. When `ordered=False`, labels must be provided.
     retbins : bool, default False
         Whether to return the bins or not. Useful when bins is provided
         as a scalar.
@@ -85,6 +86,13 @@ def cut(
         If bin edges are not unique, raise ValueError or drop non-uniques.
 
         .. versionadded:: 0.23.0
+    ordered : bool, default True
+        Whether the labels are ordered or not. Applies to returned types
+        Categorical and Series (with Categorical dtype). If True,
+        the resulting categorical will be ordered. If False, the resulting
+        categorical will be unordered (labels must be provided).
+
+        .. versionadded:: 1.1.0
 
     Returns
     -------
@@ -145,6 +153,14 @@ def cut(
     [bad, good, medium, medium, good, bad]
     Categories (3, object): [bad < medium < good]
 
+    ``ordered=False`` will result in unordered categories when labels are passed.
+    This parameter can be used to allow non-unique labels:
+
+    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
+    ...        labels=["B", "A", "B"], ordered=False)
+    [B, B, A, A, B, B]
+    Categories (2, object): [A, B]
+
     ``labels=False`` implies you just want the bins back.
 
     >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
@@ -265,6 +281,7 @@ def cut(
         include_lowest=include_lowest,
         dtype=dtype,
         duplicates=duplicates,
+        ordered=ordered,
     )
 
     return _postprocess_for_cut(fac, bins, retbins, dtype, original)
@@ -362,7 +379,10 @@ def _bins_to_cuts(
     include_lowest: bool = False,
     dtype=None,
     duplicates: str = "raise",
+    ordered: bool = True,
 ):
+    if not ordered and not labels:
+        raise ValueError("'labels' must be provided if 'ordered = False'")
 
     if duplicates not in ["raise", "drop"]:
         raise ValueError(
@@ -405,16 +425,22 @@ def _bins_to_cuts(
             labels = _format_labels(
                 bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
             )
-
+        elif ordered and len(set(labels)) != len(labels):
+            raise ValueError(
+                "labels must be unique if ordered=True; pass ordered=False for duplicate labels"  # noqa
+            )
         else:
             if len(labels) != len(bins) - 1:
                 raise ValueError(
                     "Bin labels must be one fewer than the number of bin edges"
                 )
-
         if not is_categorical_dtype(labels):
-            labels = Categorical(labels, categories=labels, ordered=True)
-
+            labels = Categorical(
+                labels,
+                categories=labels if len(set(labels)) == len(labels) else None,
+                ordered=ordered,
+            )
+        # TODO: handle mismatch between categorical label order and pandas.cut order.
         np.putmask(ids, na_mask, 0)
         result = algos.take_nd(labels, ids - 1)
 

diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
@@ -625,3 +625,42 @@ def test_cut_nullable_integer(bins, right, include_lowest):
     )
     expected = cut(a, bins, right=right, include_lowest=include_lowest)
     tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, bins, labels, expected_codes, expected_labels",
+    [
+        ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
+        ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
+    ],
+)
+def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
+    # GH 33141: ordered=False lets duplicate labels collapse into unique categories
+    result = cut(data, bins=bins, labels=labels, ordered=False)
+    expected = Categorical.from_codes(
+        expected_codes, categories=expected_labels, ordered=False
+    )
+    tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, bins, labels, expected_codes, expected_labels",
+    [
+        ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
+        ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
+    ],
+)
+def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
+    # GH 33141: unique labels with ordered=False keep the given label order
+    result = cut(data, bins=bins, labels=labels, ordered=False)
+    expected = Categorical.from_codes(
+        expected_codes, categories=expected_labels, ordered=False
+    )
+    tm.assert_categorical_equal(result, expected)
+
+
+def test_cut_unordered_with_missing_labels_raises_error():
+    # GH 33141: ordered=False without explicit labels must raise ValueError
+    msg = "'labels' must be provided if 'ordered = False'"
+    with pytest.raises(ValueError, match=msg):
+        cut([0.5, 3], bins=[0, 1, 2], ordered=False)