Allow drop bins when using the cut function (#20947)

FANGOD · jreback · commit c30f0bbc94d4 · 2018-05-10T14:27:48.000-04:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -525,6 +525,7 @@ Other Enhancements
   library. (:issue:`20564`)
 - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``.  This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`)
 - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`)
+- :func:`cut` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`20947`)
 - :func:`date_range` now returns a linearly spaced ``DatetimeIndex`` if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`, :issue:`20983`)
 
 .. _whatsnew_0230.api_breaking:
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -24,7 +24,7 @@
 
 
 def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
-        include_lowest=False):
+        include_lowest=False, duplicates='raise'):
     """
     Bin values into discrete intervals.
 
@@ -65,6 +65,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         The precision at which to store and display the bins labels.
     include_lowest : bool, default False
         Whether the first interval should be left-inclusive or not.
+    duplicates : {default 'raise', 'drop'}, optional
+        If bin edges are not unique, raise ValueError or drop non-uniques.
+
+        .. versionadded:: 0.23.0
 
     Returns
     -------
@@ -85,7 +89,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     bins : numpy.ndarray or IntervalIndex.
         The computed or specified bins. Only returned when `retbins=True`.
         For scalar or sequence `bins`, this is an ndarray with the computed
-        bins. For an IntervalIndex `bins`, this is equal to `bins`.
+        bins. If `duplicates='drop'`, non-unique bin edges are dropped. For
+        an IntervalIndex `bins`, this is equal to `bins`.
 
     See Also
     --------
@@ -144,6 +149,32 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     dtype: category
     Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...
 
+    Passing a Series as input returns a Series with the same index, in
+    which each value is mapped to the interval (bin) it falls into.
+
+    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
+    ...               index=['a', 'b', 'c', 'd', 'e'])
+    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
+    ... # doctest: +ELLIPSIS
+    (a    1.0
+     b    2.0
+     c    3.0
+     d    4.0
+     e    NaN
+     dtype: float64, array([ 0,  2,  4,  6,  8, 10]))
+
+    Use `duplicates='drop'` when the bin edges are not unique.
+
+    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
+    ...    right=False, duplicates='drop')
+    ... # doctest: +ELLIPSIS
+    (a    1.0
+     b    2.0
+     c    3.0
+     d    3.0
+     e    NaN
+     dtype: float64, array([ 0,  2,  4,  6, 10]))
+
     Passing an IntervalIndex for `bins` results in those categories exactly.
     Notice that values not covered by the IntervalIndex are set to NaN. 0
     is to the left of the first bin (which is closed on the right), and 1.5
@@ -199,7 +230,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,
                               precision=precision,
                               include_lowest=include_lowest,
-                              dtype=dtype)
+                              dtype=dtype,
+                              duplicates=duplicates)
 
     return _postprocess_for_cut(fac, bins, retbins, x_is_series,
                                 series_index, name)
diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
@@ -4,6 +4,7 @@
 import numpy as np
 from pandas.compat import zip
 
+import pandas as pd
 from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
                     Timestamp, Interval, IntervalIndex, Categorical,
                     cut, qcut, date_range, NaT, TimedeltaIndex)
@@ -337,6 +338,21 @@ def test_series_retbins(self):
             CDT(ordered=True))
         tm.assert_series_equal(result, expected)
 
+    def test_cut_duplicates_bin(self):
+        # issue 20947
+        values = Series(np.array([1, 3, 5, 7, 9]),
+                        index=["a", "b", "c", "d", "e"])
+        bins = [0, 2, 4, 6, 10, 10]
+        result = cut(values, bins, duplicates='drop')
+        expected = cut(values, pd.unique(bins))
+        tm.assert_series_equal(result, expected)
+
+        pytest.raises(ValueError, cut, values, bins)
+        pytest.raises(ValueError, cut, values, bins, duplicates='raise')
+
+        # invalid
+        pytest.raises(ValueError, cut, values, bins, duplicates='foo')
+
     def test_qcut_duplicates_bin(self):
         # GH 7751
         values = [0, 0, 0, 0, 1, 2, 3]