-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: Fixed cut regression, improve Categorical #34952
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -413,6 +413,7 @@ def _bins_to_cuts( | |
|
||
na_mask = isna(x) | (ids == len(bins)) | (ids == 0) | ||
has_nas = na_mask.any() | ||
labels_are_unique = None | ||
|
||
if labels is not False: | ||
if not (labels is None or is_list_like(labels)): | ||
|
@@ -425,6 +426,7 @@ def _bins_to_cuts( | |
labels = _format_labels( | ||
bins, precision, right=right, include_lowest=include_lowest, dtype=dtype | ||
) | ||
labels_are_unique = True | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looking again, it may not be safe to assume that |
||
elif ordered and len(set(labels)) != len(labels): | ||
raise ValueError( | ||
"labels must be unique if ordered=True; pass ordered=False for duplicate labels" # noqa | ||
|
@@ -435,11 +437,14 @@ def _bins_to_cuts( | |
"Bin labels must be one fewer than the number of bin edges" | ||
) | ||
if not is_categorical_dtype(labels): | ||
labels = Categorical( | ||
labels, | ||
categories=labels if len(set(labels)) == len(labels) else None, | ||
ordered=ordered, | ||
) | ||
if labels_are_unique: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why this extra logic? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry... I thought I reverted this. Forgot to push. But, the |
||
categories = labels | ||
elif labels_are_unique is None: | ||
categories = labels if len(set(labels)) == len(labels) else None | ||
else: | ||
categories = None | ||
|
||
labels = Categorical(labels, categories=categories, ordered=ordered) | ||
# TODO: handle mismatch between categorical label order and pandas.cut order. | ||
np.putmask(ids, na_mask, 0) | ||
result = algos.take_nd(labels, ids - 1) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, shouldn't this be in the caller of this routine?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's best here. We call it in ~3 places, and I think each would have to repeat this check & the object casting stuff.