From 0dccd269310d23c3c2998cfc94f18691c5ecbd7f Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sat, 11 Apr 2020 13:49:25 +0200 Subject: [PATCH 1/8] BUG: Add unordered option to pandas.cut. Added tests. Issue: Pandas cut raises error if labels are non-unique (#33141) --- doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/core/reshape/tile.py | 10 +++++++- pandas/tests/reshape/test_cut.py | 40 ++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 584e21e87390d..dfdb318d28580 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -540,6 +540,8 @@ Reshaping - Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) - Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) +- Bug in :func:`cut` when non-unique labels used an error raised. Added input parameter ``ordered`` to :func:`cut` with default (``ordered=True``). + If ``ordered=False`` and no labels are provided, an error will be raised (:issue:33141`) Sparse diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 66c2f5c9b927f..5150c9a0a7c67 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -38,6 +38,7 @@ def cut( precision: int = 3, include_lowest: bool = False, duplicates: str = "raise", + ordered: bool = True, ): """ Bin values into discrete intervals. 
@@ -265,6 +266,7 @@ def cut( include_lowest=include_lowest, dtype=dtype, duplicates=duplicates, + ordered=ordered, ) return _postprocess_for_cut(fac, bins, retbins, dtype, original) @@ -362,7 +364,10 @@ def _bins_to_cuts( include_lowest: bool = False, dtype=None, duplicates: str = "raise", + ordered: bool = True, ): + if not ordered and not labels: + raise ValueError("'labels' must be provided if 'ordered = False'") if duplicates not in ["raise", "drop"]: raise ValueError( @@ -413,7 +418,10 @@ def _bins_to_cuts( ) if not is_categorical_dtype(labels): - labels = Categorical(labels, categories=labels, ordered=True) + if len(set(labels)) == len(labels): + labels = Categorical(labels, categories=labels, ordered=ordered) + else: + labels = Categorical(labels, ordered=ordered) np.putmask(ids, na_mask, 0) result = algos.take_nd(labels, ids - 1) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 830e786fd1c6d..1cc9b7a3cf7fe 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -625,3 +625,43 @@ def test_cut_nullable_integer(bins, right, include_lowest): ) expected = cut(a, bins, right=right, include_lowest=include_lowest) tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize( + "data, bins, labels", + [ + ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"]), + ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2]), + ], +) +def test_cut_non_unique_labels(data, bins, labels): + result = cut(data, bins=bins, labels=labels, ordered=False) + expected = cut( + data, bins=bins, labels=Categorical(labels, ordered=False), ordered=False + ) + tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize( + "data, bins, labels", + [ + ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"]), + ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2]), + ], +) +def test_cut_unordered_labels(data, bins, labels): + result = cut(data, bins=bins, labels=labels, ordered=False) + expected = cut( + data, + bins=bins, + 
labels=Categorical(labels, categories=labels, ordered=False), + ordered=False, + ) + tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize("data, bins,", [([0.5, 3], [0, 1, 2])]) +def test_cut_unordered_with_missing_labels_raises_error(data, bins): + msg = "'labels' must be provided if 'ordered = False'" + with pytest.raises(ValueError, match=msg): + cut(data, bins=bins, ordered=False) From c7fc2aef590ce7f77cc1d0b6380a58258cc0700d Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sat, 11 Apr 2020 17:48:19 +0200 Subject: [PATCH 2/8] BUG: Added parameter in the documentation. Pandas cut, add ordered option. (#33141) --- pandas/core/reshape/tile.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 5150c9a0a7c67..290c69e7638fa 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -74,7 +74,7 @@ def cut( the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). This argument is ignored when `bins` is an IntervalIndex. If True, - raises an error. + raises an error. When `ordered=False`, labels must be provided. retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. @@ -86,6 +86,13 @@ def cut( If bin edges are not unique, raise ValueError or drop non-uniques. .. versionadded:: 0.23.0 + ordered : bool, default True + Whether the labels are ordered or not. Applies to returned types + Categorical and Series (with Categorical dtype). If True, + the resulting categorical will be ordered. If False, the resulting + categorical will be unordered (labels must be provided). + + .. 
versionadded:: 1.1.0 Returns ------- @@ -146,6 +153,13 @@ def cut( [bad, good, medium, medium, good, bad] Categories (3, object): [bad < medium < good] + ``ordered=False`` will result in unordered categories when labels are passed: + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), + ... 3, labels=["B", "C", "A"]) + [B, A, C, C, A, B] + Categories (3, object): [B < C < A] + ``labels=False`` implies you just want the bins back. >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) From 55c086ef97e1611c4b4ad185e3e150319b8d6bf8 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Tue, 14 Apr 2020 15:24:06 +0200 Subject: [PATCH 3/8] BUG: Update added tests to use Categorical.from_codes. Update whats new lines. Updated docstrings (#33141) --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- pandas/core/reshape/tile.py | 11 ++++++----- pandas/tests/reshape/test_cut.py | 34 +++++++++++++++----------------- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index dfdb318d28580..9f3658f49d9ab 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -91,6 +91,7 @@ Other enhancements - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). +- The :func:`cut` will now accept parameter ``ordered`` with default ``ordered=True``. If ``ordered=False`` and no labels are provided, an error will be raised (:issue:`33141`) .. 
--------------------------------------------------------------------------- @@ -540,8 +541,7 @@ Reshaping - Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) - Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) -- Bug in :func:`cut` when non-unique labels used an error raised. Added input parameter ``ordered`` to :func:`cut` with default (``ordered=True``). - If ``ordered=False`` and no labels are provided, an error will be raised (:issue:33141`) +- Bug in :func:`cut` raised an error when the argument ``labels`` contains duplicates (:issue:`33141`) Sparse diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 290c69e7638fa..1d3e44ca37345 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -153,12 +153,13 @@ def cut( [bad, good, medium, medium, good, bad] Categories (3, object): [bad < medium < good] - ``ordered=False`` will result in unordered categories when labels are passed: + ``ordered=False`` will result in unordered categories when labels are passed. + This parameter can be used to allow non-unique labels: - >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["B", "C", "A"]) - [B, A, C, C, A, B] - Categories (3, object): [B < C < A] + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, + ... labels=["B", "A", "B"], ordered=False) + [B, B, A, A, B, B] + Categories (2, object): [A, B] ``labels=False`` implies you just want the bins back. 
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 1cc9b7a3cf7fe..1598c3560c9c8 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -628,40 +628,38 @@ def test_cut_nullable_integer(bins, right, include_lowest): @pytest.mark.parametrize( - "data, bins, labels", + "data, bins, labels, expected_codes, expected_labels", [ - ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"]), - ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2]), + ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]), + ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]), ], ) -def test_cut_non_unique_labels(data, bins, labels): +def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels): + # GH 33141 result = cut(data, bins=bins, labels=labels, ordered=False) - expected = cut( - data, bins=bins, labels=Categorical(labels, ordered=False), ordered=False + expected = Categorical.from_codes( + expected_codes, categories=expected_labels, ordered=False ) tm.assert_categorical_equal(result, expected) @pytest.mark.parametrize( - "data, bins, labels", + "data, bins, labels, expected_codes, expected_labels", [ - ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"]), - ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2]), + ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]), + ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]), ], ) -def test_cut_unordered_labels(data, bins, labels): +def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels): + # GH 33141 result = cut(data, bins=bins, labels=labels, ordered=False) - expected = cut( - data, - bins=bins, - labels=Categorical(labels, categories=labels, ordered=False), - ordered=False, + expected = Categorical.from_codes( + expected_codes, categories=expected_labels, ordered=False ) tm.assert_categorical_equal(result, expected) -@pytest.mark.parametrize("data, bins,", [([0.5, 3], [0, 1, 2])]) 
-def test_cut_unordered_with_missing_labels_raises_error(data, bins): +def test_cut_unordered_with_missing_labels_raises_error(): msg = "'labels' must be provided if 'ordered = False'" with pytest.raises(ValueError, match=msg): - cut(data, bins=bins, ordered=False) + cut([0.5, 3], bins=[0, 1, 2], ordered=False) From 777e13e75877763b95f458ce2031f385bb946e2a Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Sun, 26 Apr 2020 18:10:54 +0200 Subject: [PATCH 4/8] BUG: Added GH issue to test. Updated docstrings (#33141) --- pandas/tests/reshape/test_cut.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 1598c3560c9c8..60c80a8abdba6 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -660,6 +660,7 @@ def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_label def test_cut_unordered_with_missing_labels_raises_error(): + # GH 33141 msg = "'labels' must be provided if 'ordered = False'" with pytest.raises(ValueError, match=msg): cut([0.5, 3], bins=[0, 1, 2], ordered=False) From 8671b110972cfdc1a9721e0fd01766fdcd45da26 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Tue, 28 Apr 2020 18:18:53 +0200 Subject: [PATCH 5/8] BUG: Raise error when ordered=True and labels not unique. 
Updated docstrings (#33141) --- pandas/core/reshape/tile.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 1d3e44ca37345..43deb778e8696 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -383,6 +383,10 @@ def _bins_to_cuts( ): if not ordered and not labels: raise ValueError("'labels' must be provided if 'ordered = False'") + if ordered and len(set(labels)) != len(labels): + raise ValueError( + "labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa + ) if duplicates not in ["raise", "drop"]: raise ValueError( @@ -431,13 +435,11 @@ def _bins_to_cuts( raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) - if not is_categorical_dtype(labels): - if len(set(labels)) == len(labels): - labels = Categorical(labels, categories=labels, ordered=ordered) - else: - labels = Categorical(labels, ordered=ordered) - + labels = Categorical( + labels, categories=labels if ordered else None, ordered=ordered + ) + # TODO: handle mismach between categorical label order and pandas.cut order. 
np.putmask(ids, na_mask, 0) result = algos.take_nd(labels, ids - 1) From aa33ee0aa89e42015e603dc903d62c6445c2c3f5 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Wed, 29 Apr 2020 19:05:40 +0200 Subject: [PATCH 6/8] BUG: Fix label checking in reshape.tile.py #33141 --- pandas/core/reshape/tile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 43deb778e8696..096eed9fe1c14 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -383,7 +383,8 @@ def _bins_to_cuts( ): if not ordered and not labels: raise ValueError("'labels' must be provided if 'ordered = False'") - if ordered and len(set(labels)) != len(labels): + + if labels and ordered and len(set(labels)) != len(labels): raise ValueError( "labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa ) From d325fab797cba9d642ab5eac0923aadb7e4acf6e Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Thu, 30 Apr 2020 11:41:49 +0200 Subject: [PATCH 7/8] BUG: Move label checking in reshape.tile.py #33141 --- pandas/core/reshape/tile.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 096eed9fe1c14..279f8091bc114 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -384,11 +384,6 @@ def _bins_to_cuts( if not ordered and not labels: raise ValueError("'labels' must be provided if 'ordered = False'") - if labels and ordered and len(set(labels)) != len(labels): - raise ValueError( - "labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa - ) - if duplicates not in ["raise", "drop"]: raise ValueError( "invalid value for 'duplicates' parameter, valid options are: raise, drop" @@ -430,7 +425,10 @@ def _bins_to_cuts( labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) - + elif ordered and 
len(set(labels)) != len(labels): + raise ValueError( + "labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa + ) else: if len(labels) != len(bins) - 1: raise ValueError( From 972f1136b5741a8fcaf4bb18b2af3668007ce438 Mon Sep 17 00:00:00 2001 From: Mabel Villalba Date: Thu, 30 Apr 2020 11:54:22 +0200 Subject: [PATCH 8/8] BUG: Update categories condition to avoid just duplicates labels in reshape.tile.py #33141 --- pandas/core/reshape/tile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 279f8091bc114..345239eeb2372 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -436,7 +436,9 @@ def _bins_to_cuts( ) if not is_categorical_dtype(labels): labels = Categorical( - labels, categories=labels if ordered else None, ordered=ordered + labels, + categories=labels if len(set(labels)) == len(labels) else None, + ordered=ordered, ) # TODO: handle mismach between categorical label order and pandas.cut order. np.putmask(ids, na_mask, 0)