Skip to content

Commit c30f0bb

Browse files
FANGOD authored and jreback committed
Allow drop bins when using the cut function (#20947)
1 parent 8e2a4a9 commit c30f0bb

File tree

3 files changed

+52
-3
lines changed

3 files changed

+52
-3
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,7 @@ Other Enhancements
525525
library. (:issue:`20564`)
526526
- Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`)
527527
- :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`)
528+
- :func:`cut` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`20947`)
528529
- :func:`date_range` now returns a linearly spaced ``DatetimeIndex`` if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`, :issue:`20983`)
529530

530531
.. _whatsnew_0230.api_breaking:

pandas/core/reshape/tile.py

+35-3
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525

2626
def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
27-
include_lowest=False):
27+
include_lowest=False, duplicates='raise'):
2828
"""
2929
Bin values into discrete intervals.
3030
@@ -65,6 +65,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
6565
The precision at which to store and display the bins labels.
6666
include_lowest : bool, default False
6767
Whether the first interval should be left-inclusive or not.
68+
duplicates : {'raise', 'drop'}, default 'raise'
69+
If bin edges are not unique, raise ValueError or drop non-uniques.
70+
71+
.. versionadded:: 0.23.0
6872
6973
Returns
7074
-------
@@ -85,7 +89,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
8589
bins : numpy.ndarray or IntervalIndex.
8690
The computed or specified bins. Only returned when `retbins=True`.
8791
For scalar or sequence `bins`, this is an ndarray with the computed
88-
bins. For an IntervalIndex `bins`, this is equal to `bins`.
92+
bins. If `duplicates='drop'`, non-unique bin edges are dropped. For
93+
an IntervalIndex `bins`, this is equal to `bins`.
8994
9095
See Also
9196
--------
@@ -144,6 +149,32 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
144149
dtype: category
145150
Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...
146151
152+
Passing a Series as input returns a Series of mapped values.
153+
Each value is mapped numerically to an interval based on the bins.
154+
155+
>>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
156+
... index=['a', 'b', 'c', 'd', 'e'])
157+
>>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
158+
... # doctest: +ELLIPSIS
159+
(a 1.0
160+
b 2.0
161+
c 3.0
162+
d 4.0
163+
e NaN
164+
dtype: float64, array([ 0, 2, 4, 6, 8, 10]))
165+
166+
Use the `duplicates='drop'` option when the bin edges are not unique.
167+
168+
>>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
169+
... right=False, duplicates='drop')
170+
... # doctest: +ELLIPSIS
171+
(a 1.0
172+
b 2.0
173+
c 3.0
174+
d 3.0
175+
e NaN
176+
dtype: float64, array([ 0, 2, 4, 6, 10]))
177+
147178
Passing an IntervalIndex for `bins` results in those categories exactly.
148179
Notice that values not covered by the IntervalIndex are set to NaN. 0
149180
is to the left of the first bin (which is closed on the right), and 1.5
@@ -199,7 +230,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
199230
fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,
200231
precision=precision,
201232
include_lowest=include_lowest,
202-
dtype=dtype)
233+
dtype=dtype,
234+
duplicates=duplicates)
203235

204236
return _postprocess_for_cut(fac, bins, retbins, x_is_series,
205237
series_index, name)

pandas/tests/reshape/test_tile.py

+16
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
from pandas.compat import zip
66

7+
import pandas as pd
78
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
89
Timestamp, Interval, IntervalIndex, Categorical,
910
cut, qcut, date_range, NaT, TimedeltaIndex)
@@ -337,6 +338,21 @@ def test_series_retbins(self):
337338
CDT(ordered=True))
338339
tm.assert_series_equal(result, expected)
339340

341+
def test_cut_duplicates_bin(self):
342+
# GH 20947
343+
values = Series(np.array([1, 3, 5, 7, 9]),
344+
index=["a", "b", "c", "d", "e"])
345+
bins = [0, 2, 4, 6, 10, 10]
346+
result = cut(values, bins, duplicates='drop')
347+
expected = cut(values, pd.unique(bins))
348+
tm.assert_series_equal(result, expected)
349+
350+
pytest.raises(ValueError, cut, values, bins)
351+
pytest.raises(ValueError, cut, values, bins, duplicates='raise')
352+
353+
# invalid
354+
pytest.raises(ValueError, cut, values, bins, duplicates='foo')
355+
340356
def test_qcut_duplicates_bin(self):
341357
# GH 7751
342358
values = [0, 0, 0, 0, 1, 2, 3]

0 commit comments

Comments
 (0)