diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0873e4b34b0b1..64c5de6cb100a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -105,6 +105,7 @@ Other enhancements of sorting or an incorrect key. See :ref:`here ` - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) +- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index c9a96d80f35ba..8b180957801f9 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -272,6 +272,18 @@ def test_series_retbins(self): np.array([0, 0, 1, 1], dtype=np.int8)) tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + def test_qcut_duplicates_drop(self): + # GH 7751 + values = [0, 0, 0, 0, 1, 2, 3] + cats = qcut(values, 3, duplicates='drop') + ex_levels = ['[0, 1]', '(1, 3]'] + self.assertTrue((cats.categories == ex_levels).all()) + + def test_qcut_duplicates_raise(self): + # GH 7751 + values = [0, 0, 0, 0, 1, 2, 3] + self.assertRaises(ValueError, qcut, values, 3, duplicates='raise') + def test_single_bin(self): # issue 14652 expected = Series([0, 0]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index a372e113f1d7e..2875d9c14dc47 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -129,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, series_index, name) -def qcut(x, q, labels=None, retbins=False, precision=3): +def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. For example @@ -151,6 +151,10 @@ def qcut(x, q, labels=None, retbins=False, precision=3): as a scalar. precision : int The precision at which to store and display the bins labels + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + + .. versionadded:: 0.20.0 Returns ------- @@ -187,7 +191,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): bins = algos.quantile(x, quantiles) fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, - dtype=dtype) + dtype=dtype, duplicates=duplicates) return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name) @@ -195,14 +199,24 @@ def qcut(x, q, labels=None, retbins=False, precision=3): def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, - dtype=None): + dtype=None, duplicates='raise'): + + if duplicates not in ['raise', 'drop']: + raise ValueError("invalid value for 'duplicates' parameter, " + "valid options are: raise, drop") + + unique_bins = algos.unique(bins) + if len(unique_bins) < len(bins): + if duplicates == 'raise': + raise ValueError("Bin edges must be unique: {}. You " + "can drop duplicate edges by setting " + "'duplicates' param".format(repr(bins))) + else: + bins = unique_bins side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) - if len(algos.unique(bins)) < len(bins): - raise ValueError('Bin edges must be unique: %s' % repr(bins)) - if include_lowest: ids[x == bins[0]] = 1