Skip to content

Commit 8051d61

Browse files
ashishsingal1 authored and jreback committed
ERR: qcut uniqueness checking (try 2)
closes #7751 Add option to drop non-unique bins in qcut/cut Author: Ashish Singal <[email protected]> Author: Ashish <[email protected]> Closes #15000 from ashishsingal1/master and squashes the following commits: 698b4ec [Ashish Singal] Update tile.py b6bf401 [Ashish Singal] Update v0.20.0.txt 42bf482 [Ashish Singal] Update tile.py 221c0b3 [Ashish Singal] Update tile.py 2c5bc35 [Ashish] added duplicates='raise' test. other fixes to qcut for duplicates='raise' 3dbc416 [Ashish Singal] Update v0.20.0.txt 2161518 [Ashish Singal] Update tile.py 1ce77d0 [Ashish Singal] Update test_tile.py 3f98abc [Ashish Singal] Update tile.py 0b8efeb [Ashish Singal] Update tile.py a2dd8ce [Ashish] fixing duplicates check bee981c [Ashish] adding 'duplicates' option to qcut
1 parent 17d7ddb commit 8051d61

File tree

3 files changed

+33
-6
lines changed

3 files changed

+33
-6
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ Other enhancements
105105
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
106106

107107
- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`)
108+
- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
108109
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
109110
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
110111
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)

pandas/tools/tests/test_tile.py

+12
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,18 @@ def test_series_retbins(self):
272272
np.array([0, 0, 1, 1], dtype=np.int8))
273273
tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3]))
274274

275+
def test_qcut_duplicates_drop(self):
276+
# GH 7751
277+
values = [0, 0, 0, 0, 1, 2, 3]
278+
cats = qcut(values, 3, duplicates='drop')
279+
ex_levels = ['[0, 1]', '(1, 3]']
280+
self.assertTrue((cats.categories == ex_levels).all())
281+
282+
def test_qcut_duplicates_raise(self):
283+
# GH 7751
284+
values = [0, 0, 0, 0, 1, 2, 3]
285+
self.assertRaises(ValueError, qcut, values, 3, duplicates='raise')
286+
275287
def test_single_bin(self):
276288
# issue 14652
277289
expected = Series([0, 0])

pandas/tools/tile.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
129129
series_index, name)
130130

131131

132-
def qcut(x, q, labels=None, retbins=False, precision=3):
132+
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
133133
"""
134134
Quantile-based discretization function. Discretize variable into
135135
equal-sized buckets based on rank or based on sample quantiles. For example
@@ -151,6 +151,10 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
151151
as a scalar.
152152
precision : int
153153
The precision at which to store and display the bins labels
154+
duplicates : {default 'raise', 'drop'}, optional
155+
If bin edges are not unique, raise ValueError or drop non-uniques.
156+
157+
.. versionadded:: 0.20.0
154158
155159
Returns
156160
-------
@@ -187,22 +191,32 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
187191
bins = algos.quantile(x, quantiles)
188192
fac, bins = _bins_to_cuts(x, bins, labels=labels,
189193
precision=precision, include_lowest=True,
190-
dtype=dtype)
194+
dtype=dtype, duplicates=duplicates)
191195

192196
return _postprocess_for_cut(fac, bins, retbins, x_is_series,
193197
series_index, name)
194198

195199

196200
def _bins_to_cuts(x, bins, right=True, labels=None,
197201
precision=3, include_lowest=False,
198-
dtype=None):
202+
dtype=None, duplicates='raise'):
203+
204+
if duplicates not in ['raise', 'drop']:
205+
raise ValueError("invalid value for 'duplicates' parameter, "
206+
"valid options are: raise, drop")
207+
208+
unique_bins = algos.unique(bins)
209+
if len(unique_bins) < len(bins):
210+
if duplicates == 'raise':
211+
raise ValueError("Bin edges must be unique: {}. You "
212+
"can drop duplicate edges by setting "
213+
"'duplicates' param".format(repr(bins)))
214+
else:
215+
bins = unique_bins
199216

200217
side = 'left' if right else 'right'
201218
ids = bins.searchsorted(x, side=side)
202219

203-
if len(algos.unique(bins)) < len(bins):
204-
raise ValueError('Bin edges must be unique: %s' % repr(bins))
205-
206220
if include_lowest:
207221
ids[x == bins[0]] = 1
208222

0 commit comments

Comments
 (0)