diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 597a0c5386cf0..8650eb24f615d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -139,6 +139,7 @@ Other enhancements - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). +- :func:`cut` now accepts an ``ordered`` parameter (default ``ordered=True``). If ``ordered=False`` and no labels are provided, an error is raised (:issue:`33141`) - :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_json` now support passing a dict of compression arguments when using the ``gzip`` and ``bz2`` protocols. @@ -720,6 +721,7 @@ Reshaping - Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) - Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) +- Bug in :func:`cut` where passing non-unique ``labels`` raised an error (:issue:`33141`) Sparse diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 66c2f5c9b927f..345239eeb2372 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -38,6 +38,7 @@ def cut( precision: int = 3, include_lowest: bool = False, duplicates: str = "raise", + ordered: bool = True, ): """ Bin values into discrete intervals. @@ -73,7 +74,7 @@ def cut( the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). This argument is ignored when `bins` is an IntervalIndex. If True, - raises an error. + raises an error. When `ordered=False`, labels must be provided. 
retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. @@ -85,6 +86,13 @@ def cut( If bin edges are not unique, raise ValueError or drop non-uniques. .. versionadded:: 0.23.0 + ordered : bool, default True + Whether the labels are ordered or not. Applies to returned types + Categorical and Series (with Categorical dtype). If True, + the resulting categorical will be ordered. If False, the resulting + categorical will be unordered (labels must be provided). + + .. versionadded:: 1.1.0 Returns ------- @@ -145,6 +153,14 @@ def cut( [bad, good, medium, medium, good, bad] Categories (3, object): [bad < medium < good] + ``ordered=False`` will result in unordered categories when labels are passed. + This parameter can be used to allow non-unique labels: + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, + ... labels=["B", "A", "B"], ordered=False) + [B, B, A, A, B, B] + Categories (2, object): [A, B] + ``labels=False`` implies you just want the bins back. 
>>> pd.cut([0, 1, 1, 2], bins=4, labels=False) @@ -265,6 +281,7 @@ def cut( include_lowest=include_lowest, dtype=dtype, duplicates=duplicates, + ordered=ordered, ) return _postprocess_for_cut(fac, bins, retbins, dtype, original) @@ -362,7 +379,10 @@ def _bins_to_cuts( include_lowest: bool = False, dtype=None, duplicates: str = "raise", + ordered: bool = True, ): + if not ordered and labels is None: + raise ValueError("'labels' must be provided if 'ordered = False'") if duplicates not in ["raise", "drop"]: raise ValueError( @@ -405,16 +425,22 @@ def _bins_to_cuts( labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) - + elif ordered and len(set(labels)) != len(labels): + raise ValueError( + "labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa + ) else: if len(labels) != len(bins) - 1: raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) - if not is_categorical_dtype(labels): - labels = Categorical(labels, categories=labels, ordered=True) - + labels = Categorical( + labels, + categories=labels if len(set(labels)) == len(labels) else None, + ordered=ordered, + ) + # TODO: handle mismatch between categorical label order and pandas.cut order. 
np.putmask(ids, na_mask, 0) result = algos.take_nd(labels, ids - 1) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 830e786fd1c6d..60c80a8abdba6 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -625,3 +625,42 @@ def test_cut_nullable_integer(bins, right, include_lowest): ) expected = cut(a, bins, right=right, include_lowest=include_lowest) tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize( + "data, bins, labels, expected_codes, expected_labels", + [ + ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]), + ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]), + ], +) +def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels): + # GH 33141 + result = cut(data, bins=bins, labels=labels, ordered=False) + expected = Categorical.from_codes( + expected_codes, categories=expected_labels, ordered=False + ) + tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize( + "data, bins, labels, expected_codes, expected_labels", + [ + ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]), + ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]), + ], +) +def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels): + # GH 33141 + result = cut(data, bins=bins, labels=labels, ordered=False) + expected = Categorical.from_codes( + expected_codes, categories=expected_labels, ordered=False + ) + tm.assert_categorical_equal(result, expected) + + +def test_cut_unordered_with_missing_labels_raises_error(): + # GH 33141 + msg = "'labels' must be provided if 'ordered = False'" + with pytest.raises(ValueError, match=msg): + cut([0.5, 3], bins=[0, 1, 2], ordered=False)