-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: improve performance of groupby rank (#21237) #21285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, | |
is_datetimelike : bool, default False | ||
unused in this method but provided for call compatibility with other | ||
Cython transformations | ||
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' | ||
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default | ||
'average' | ||
* average: average rank of group | ||
* min: lowest rank in group | ||
* max: highest rank in group | ||
|
@@ -514,26 +515,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, | |
dups += 1 | ||
sum_ranks += i - grp_start + 1 | ||
|
||
# if keep_na, check for missing values and assign back | ||
# to the result where appropriate | ||
|
||
if keep_na and mask[_as[i]]: | ||
grp_na_count += 1 | ||
out[_as[i], 0] = nan | ||
else: | ||
# this implementation is inefficient because it will | ||
# continue overwriting previously encountered dups | ||
# i.e. if 5 duplicated values are encountered it will | ||
# write to the result as follows (assumes avg tiebreaker): | ||
# 1 | ||
# .5 .5 | ||
# .33 .33 .33 | ||
# .25 .25 .25 .25 | ||
# .2 .2 .2 .2 .2 | ||
# | ||
# could potentially be optimized to only write to the | ||
# result once the last duplicate value is encountered | ||
if tiebreak == TIEBREAK_AVERAGE: | ||
# Update out only when there is a transition of values or labels. | ||
# When a new value or group is encountered, go back #dups steps( | ||
# the number of occurrence of current value) and assign the ranks | ||
# based on the the starting index of the current group (grp_start) | ||
# and the current index | ||
if (i == N - 1 or | ||
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) or | ||
(mask[_as[i]] ^ mask[_as[i+1]]) or | ||
(labels[_as[i]] != labels[_as[i+1]])): | ||
# if keep_na, check for missing values and assign back | ||
# to the result where appropriate | ||
if keep_na and mask[_as[i]]: | ||
for j in range(i - dups + 1, i + 1): | ||
out[_as[j], 0] = nan | ||
grp_na_count = dups | ||
elif tiebreak == TIEBREAK_AVERAGE: | ||
for j in range(i - dups + 1, i + 1): | ||
out[_as[j], 0] = sum_ranks / <float64_t>dups | ||
elif tiebreak == TIEBREAK_MIN: | ||
|
@@ -552,38 +549,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, | |
for j in range(i - dups + 1, i + 1): | ||
out[_as[j], 0] = grp_vals_seen | ||
|
||
# look forward to the next value (using the sorting in _as) | ||
# if the value does not equal the current value then we need to | ||
# reset the dups and sum_ranks, knowing that a new value is coming | ||
# up. the conditional also needs to handle nan equality and the | ||
# end of iteration | ||
if (i == N - 1 or | ||
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) or | ||
(mask[_as[i]] ^ mask[_as[i+1]])): | ||
dups = sum_ranks = 0 | ||
val_start = i | ||
grp_vals_seen += 1 | ||
grp_tie_count +=1 | ||
|
||
# Similar to the previous conditional, check now if we are moving | ||
# to a new group. If so, keep track of the index where the new | ||
# group occurs, so the tiebreaker calculations can decrement that | ||
# from their position. fill in the size of each group encountered | ||
# (used by pct calculations later). also be sure to reset any of | ||
# the items helping to calculate dups | ||
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: | ||
if tiebreak != TIEBREAK_DENSE: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count | ||
else: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[_as[j], 0] = (grp_tie_count - | ||
(grp_na_count > 0)) | ||
dups = sum_ranks = 0 | ||
grp_na_count = 0 | ||
grp_tie_count = 0 | ||
grp_start = i + 1 | ||
grp_vals_seen = 1 | ||
# look forward to the next value (using the sorting in _as) | ||
# if the value does not equal the current value then we need to | ||
# reset the dups and sum_ranks, knowing that a new value is | ||
# coming up. the conditional also needs to handle nan equality | ||
# and the end of iteration | ||
if (i == N - 1 or | ||
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) or | ||
(mask[_as[i]] ^ mask[_as[i+1]])): | ||
dups = sum_ranks = 0 | ||
grp_vals_seen += 1 | ||
grp_tie_count += 1 | ||
|
||
# Similar to the previous conditional, check now if we are | ||
# moving to a new group. If so, keep track of the index where | ||
# the new group occurs, so the tiebreaker calculations can | ||
# decrement that from their position. fill in the size of each | ||
# group encountered (used by pct calculations later). also be | ||
# sure to reset any of the items helping to calculate dups | ||
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the above is true could maybe use else more effective here to reduce code There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about i == N -1 scenario ? I can rewrite it to the following
This is not equivalent to my current fix.
It is less readable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm OK thanks for talking me through - I think you are right that that wouldn't improve readability. |
||
if tiebreak != TIEBREAK_DENSE: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[_as[j], 0] = (i - grp_start + 1 - | ||
grp_na_count) | ||
else: | ||
for j in range(grp_start, i + 1): | ||
grp_sizes[_as[j], 0] = (grp_tie_count - | ||
(grp_na_count > 0)) | ||
dups = sum_ranks = 0 | ||
grp_na_count = 0 | ||
grp_tie_count = 0 | ||
grp_start = i + 1 | ||
grp_vals_seen = 1 | ||
|
||
if pct: | ||
for i in range(N): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this be simplified to just:
Since all of the other conditions are already accounted for higher up in the scope?