Skip to content

Commit bee981c

Browse files
committed
adding 'duplicates' option to qcut
1 parent 7f0eefc commit bee981c

File tree

2 files changed

+23
-3
lines changed

2 files changed

+23
-3
lines changed

pandas/tools/tests/test_tile.py

+7
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,13 @@ def test_series_retbins(self):
272272
np.array([0, 0, 1, 1], dtype=np.int8))
273273
tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3]))
274274

275+
def test_qcut_duplicates_drop(self):
276+
# GH 7751

277+
values = [0, 0, 0, 0, 1, 2, 3]
278+
cats = qcut(values, 3, duplicates='drop')
279+
ex_levels = ['[0, 1]', '(1, 3]']
280+
self.assertTrue((cats.categories == ex_levels).all())
281+
275282
def test_single_bin(self):
276283
# issue 14652
277284
expected = Series([0, 0])

pandas/tools/tile.py

+16 -3
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
129129
series_index, name)
130130

131131

132-
def qcut(x, q, labels=None, retbins=False, precision=3):
132+
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
133133
"""
134134
Quantile-based discretization function. Discretize variable into
135135
equal-sized buckets based on rank or based on sample quantiles. For example
@@ -151,6 +151,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
151151
as a scalar.
152152
precision : int
153153
The precision at which to store and display the bins labels
154+
duplicates : {'raise', 'drop'}, optional

155+
If binned edges are not unique, raise ValueError or drop non-uniques.
154156
155157
Returns
156158
-------
@@ -187,15 +189,26 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
187189
bins = algos.quantile(x, quantiles)
188190
fac, bins = _bins_to_cuts(x, bins, labels=labels,
189191
precision=precision, include_lowest=True,
190-
dtype=dtype)
192+
dtype=dtype, duplicates=duplicates)
191193

192194
return _postprocess_for_cut(fac, bins, retbins, x_is_series,
193195
series_index, name)
194196

195197

196198
def _bins_to_cuts(x, bins, right=True, labels=None,
197199
precision=3, include_lowest=False,
198-
dtype=None):
200+
dtype=None, duplicates='raise'):
201+
202+
if duplicates not in ['raise', 'drop']:
203+
raise ValueError("invalid value for 'duplicates' parameter, "
204+
+ "valid options are: raise, drop")
205+
206+
if duplicates == 'raise':
207+
raise ValueError('Bin edges must be unique: %s' % repr(bins) +
208+
' You can drop duplicate edges ' +
209+
'by setting \'duplicates\' param')
210+
else:
211+
bins = algos.unique(bins)
199212

200213
side = 'left' if right else 'right'
201214
ids = bins.searchsorted(x, side=side)

0 commit comments

Comments
 (0)