-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ERR: qcut uniqueness checking (try 2) #15000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
bee981c
a2dd8ce
0b8efeb
3f98abc
1ce77d0
2161518
3dbc416
2c5bc35
221c0b3
42bf482
b6bf401
698b4ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -272,6 +272,18 @@ def test_series_retbins(self): | |
np.array([0, 0, 1, 1], dtype=np.int8)) | ||
tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) | ||
|
||
def test_qcut_duplicates_drop(self): | ||
# GH 7751 | ||
values = [0, 0, 0, 0, 1, 2, 3] | ||
cats = qcut(values, 3, duplicates='drop') | ||
ex_levels = ['[0, 1]', '(1, 3]'] | ||
self.assertTrue((cats.categories == ex_levels).all()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a test for |
||
|
||
def test_qcut_duplicates_raise(self): | ||
# GH 7751 | ||
values = [0, 0, 0, 0, 1, 2, 3] | ||
self.assertRaises(ValueError, qcut, values, 3, duplicates='raise') | ||
|
||
def test_single_bin(self): | ||
# issue 14652 | ||
expected = Series([0, 0]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -129,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, | |
series_index, name) | ||
|
||
|
||
def qcut(x, q, labels=None, retbins=False, precision=3): | ||
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): | ||
""" | ||
Quantile-based discretization function. Discretize variable into | ||
equal-sized buckets based on rank or based on sample quantiles. For example | ||
|
@@ -151,6 +151,9 @@ def qcut(x, q, labels=None, retbins=False, precision=3): | |
as a scalar. | ||
precision : int | ||
The precision at which to store and display the bins labels | ||
duplicates : {'raise', 'drop'}, optional | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add "default 'raise'"? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Or you can also add it in the description, like "Default is to raise an error" (either one is fine). |
||
If bin edges are not unique, raise ValueError or drop non-uniques. | ||
.. versionadded:: 0.20.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There needs to be a blank line above this one (rst specifics ...). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There needs to be a blank line above. |
||
|
||
Returns | ||
------- | ||
|
@@ -187,22 +190,32 @@ def qcut(x, q, labels=None, retbins=False, precision=3): | |
bins = algos.quantile(x, quantiles) | ||
fac, bins = _bins_to_cuts(x, bins, labels=labels, | ||
precision=precision, include_lowest=True, | ||
dtype=dtype) | ||
dtype=dtype, duplicates=duplicates) | ||
|
||
return _postprocess_for_cut(fac, bins, retbins, x_is_series, | ||
series_index, name) | ||
|
||
|
||
def _bins_to_cuts(x, bins, right=True, labels=None, | ||
precision=3, include_lowest=False, | ||
dtype=None): | ||
dtype=None, duplicates='raise'): | ||
|
||
if duplicates not in ['raise', 'drop']: | ||
raise ValueError("invalid value for 'duplicates' parameter, " | ||
"valid options are: raise, drop") | ||
|
||
unique_bins = algos.unique(bins) | ||
if len(unique_bins) < len(bins): | ||
if duplicates == 'raise': | ||
raise ValueError("Bin edges must be unique: {}. You " | ||
"can drop duplicate edges by setting " | ||
"'duplicates' param".format(repr(bins))) | ||
else: | ||
bins = unique_bins | ||
|
||
side = 'left' if right else 'right' | ||
ids = bins.searchsorted(x, side=side) | ||
|
||
if len(algos.unique(bins)) < len(bins): | ||
raise ValueError('Bin edges must be unique: %s' % repr(bins)) | ||
|
||
if include_lowest: | ||
ids[x == bins[0]] = 1 | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pd.cut/qcut have gained the duplicates kw to control whether to raise on duplicated edges.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this relevant for cut or just qcut?