pandas-dev · jreback · May 10, 2018 · May 4, 2018 · May 5, 2018 · May 5, 2018
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -524,6 +524,7 @@ Other Enhancements
 - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``.  This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`)
 - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`)
 - :func:`date_range` now returns a linearly spaced ``DatetimeIndex`` if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`)
+- ``pd.cut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`20947`)
 
 .. _whatsnew_0230.api_breaking:
 

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -24,7 +24,7 @@
 
 
 def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
-        include_lowest=False):
+        include_lowest=False, duplicates='raise'):
     """
     Bin values into discrete intervals.
 
@@ -65,6 +65,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         The precision at which to store and display the bins labels.
     include_lowest : bool, default False
         Whether the first interval should be left-inclusive or not.
+    duplicates : {default 'raise', 'drop'}, optional
+        If bin edges are not unique, either raise a ``ValueError``
+        (``'raise'``, the default) or drop the non-unique edges
+        (``'drop'``).
+
+        .. versionadded:: 0.23.0
 
     Returns
     -------
@@ -144,6 +150,32 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     dtype: category
     Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...
 
+    Passing a Series as input returns a Series in which each value is
+    mapped numerically to the interval of `bins` that contains it.
+
+    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
+    ...               index=['a', 'b', 'c', 'd', 'e'])
+    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
+    ... # doctest: +ELLIPSIS
+    (a    1.0
+     b    2.0
+     c    3.0
+     d    4.0
+     e    NaN
+     dtype: float64, array([ 0,  2,  4,  6,  8, 10]))
+
+    Use ``duplicates='drop'`` to drop the non-unique bin edges.
+
+    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
+    ...    right=False, duplicates='drop')
+    ... # doctest: +ELLIPSIS
+    (a    1.0
+     b    2.0
+     c    3.0
+     d    3.0
+     e    NaN
+     dtype: float64, array([ 0,  2,  4,  6, 10]))
+
     Passing an IntervalIndex for `bins` results in those categories exactly.
     Notice that values not covered by the IntervalIndex are set to NaN. 0
     is to the left of the first bin (which is closed on the right), and 1.5
@@ -199,7 +231,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,
                               precision=precision,
                               include_lowest=include_lowest,
-                              dtype=dtype)
+                              dtype=dtype,
+                              duplicates=duplicates)
 
     return _postprocess_for_cut(fac, bins, retbins, x_is_series,
                                 series_index, name)

diff --git a/pandas/tests/test_tile.py b/pandas/tests/test_tile.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+from pandas import DataFrame, Series
+import pandas.util.testing as tm
+
+from pandas import cut
+
+
+class TestCut(object):
+    """Tests for the ``duplicates`` keyword of ``cut`` (GH 20947)."""
+
+    def test_cut_duplicates_drop(self):
+        # The repeated edge 10 is dropped, leaving bins [0, 2, 4, 6, 10].
+        index = ["a", "b", "c", "d", "e"]
+        values = Series(np.array([1, 3, 5, 7, 9]), index=index)
+        result = cut(values, [0, 2, 4, 6, 10, 10], labels=False, right=False,
+                     duplicates="drop")
+        expected = Series([0, 1, 2, 3, 3], index=index)
+        tm.assert_series_equal(result, expected)
+
+    def test_cut_duplicates_raise(self):
+        # Non-unique edges must raise when duplicates='raise' is requested.
+        values = Series(np.array([1, 3, 5, 7, 9]))
+        with tm.assert_raises_regex(ValueError, "Bin edges must be unique"):
+            cut(values, [0, 2, 4, 6, 10, 10], duplicates='raise')