
Commit cb3f778

PERF: improve performance of groupby rank (#21237)
1 parent 4274b84 commit cb3f778

File tree

1 file changed, +56 -48 lines changed

pandas/_libs/groupby_helper.pxi.in

+56 -48
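The change below touches only the Cython kernel behind groupby rank. For orientation, a minimal snippet that exercises the optimized path (column names and sizes are arbitrary, chosen only for illustration):

import numpy as np
import pandas as pd

# groupby rank dispatches to the group_rank_* template edited below
n = 1_000_000
df = pd.DataFrame({"key": np.random.randint(0, 1000, n),
                   "val": np.random.randint(0, 100, n)})
ranks = df.groupby("key")["val"].rank(method="average")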
@@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
     is_datetimelike : bool, default False
         unused in this method but provided for call compatibility with other
         Cython transformations
-    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
         * average: average rank of group
         * min: lowest rank in group
         * max: highest rank in group
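The docstring change above only reflows the ties_method line to fit the line-length limit; the methods themselves are unchanged. A small illustration of how two of the tie-breaking options differ (values chosen for illustration):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
                   "val": [1, 1, 2, 3, 3]})

# 'average' shares the mean rank among ties; 'dense' leaves no gaps after ties
df.groupby("key")["val"].rank(method="average")  # 1.5, 1.5, 3.0, 1.5, 1.5
df.groupby("key")["val"].rank(method="dense")    # 1.0, 1.0, 2.0, 1.0, 1.0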
@@ -513,26 +514,33 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
             # Used to calculate tiebreakers
             dups += 1
             sum_ranks += i - grp_start + 1
-
             # if keep_na, check for missing values and assign back
             # to the result where appropriate
-
             if keep_na and mask[_as[i]]:
                 grp_na_count += 1
                 out[_as[i], 0] = nan
-            else:
-                # this implementation is inefficient because it will
-                # continue overwriting previously encountered dups
-                # i.e. if 5 duplicated values are encountered it will
-                # write to the result as follows (assumes avg tiebreaker):
-                # 1
-                # .5 .5
-                # .33 .33 .33
-                # .25 .25 .25 .25
-                # .2 .2 .2 .2 .2
-                #
-                # could potentially be optimized to only write to the
-                # result once the last duplicate value is encountered
+                grp_vals_seen = i + 1
+                if (i == N - 1) or (labels[_as[i]] != labels[_as[i+1]]):
+                    grp_tie_count += 1
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1
+                continue
+
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
                 if tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         out[_as[j], 0] = sum_ranks / <float64_t>dups
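The comment removed above described the old behavior: every time another duplicate was seen, the whole run of ties was rewritten, which is quadratic in the length of a run. The new code instead writes a run exactly once, when the next sorted value (or group) differs; the extra mask and labels terms in the new conditional extend that check to NaN runs and group boundaries. A minimal pure-Python sketch of the write-once idea, ignoring groups, NaN handling and the other tiebreakers (not the actual Cython):

def average_ranks(sorted_vals):
    # rank a pre-sorted list with 'average' tie-breaking, writing each
    # run of duplicates exactly once instead of on every element
    out = [0.0] * len(sorted_vals)
    dups = 0        # length of the current run of equal values
    sum_ranks = 0   # sum of 1-based positions inside that run
    for i, v in enumerate(sorted_vals):
        dups += 1
        sum_ranks += i + 1
        # write only when the run ends: last element, or the next value differs
        if i == len(sorted_vals) - 1 or sorted_vals[i + 1] != v:
            for j in range(i - dups + 1, i + 1):
                out[j] = sum_ranks / dups
            dups = sum_ranks = 0
    return out

print(average_ranks([1, 1, 2, 5, 5, 5]))  # [1.5, 1.5, 3.0, 5.0, 5.0, 5.0]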
@@ -552,38 +560,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
                 for j in range(i - dups + 1, i + 1):
                     out[_as[j], 0] = grp_vals_seen

-            # look forward to the next value (using the sorting in _as)
-            # if the value does not equal the current value then we need to
-            # reset the dups and sum_ranks, knowing that a new value is coming
-            # up. the conditional also needs to handle nan equality and the
-            # end of iteration
-            if (i == N - 1 or
-                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
-                    (mask[_as[i]] ^ mask[_as[i+1]])):
-                dups = sum_ranks = 0
-                val_start = i
-                grp_vals_seen += 1
-                grp_tie_count +=1
-
-            # Similar to the previous conditional, check now if we are moving
-            # to a new group. If so, keep track of the index where the new
-            # group occurs, so the tiebreaker calculations can decrement that
-            # from their position. fill in the size of each group encountered
-            # (used by pct calculations later). also be sure to reset any of
-            # the items helping to calculate dups
-            if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
-                if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
-                else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = (grp_tie_count -
-                                                (grp_na_count > 0))
-                dups = sum_ranks = 0
-                grp_na_count = 0
-                grp_tie_count = 0
-                grp_start = i + 1
-                grp_vals_seen = 1
+                # look forward to the next value (using the sorting in _as)
+                # if the value does not equal the current value then we need to
+                # reset the dups and sum_ranks, knowing that a new value is
+                # coming up. the conditional also needs to handle nan equality
+                # and the end of iteration
+                if (i == N - 1 or
+                        (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                        (mask[_as[i]] ^ mask[_as[i+1]])):
+                    dups = sum_ranks = 0
+                    grp_vals_seen += 1
+                    grp_tie_count += 1
+
+                # Similar to the previous conditional, check now if we are
+                # moving to a new group. If so, keep track of the index where
+                # the new group occurs, so the tiebreaker calculations can
+                # decrement that from their position. fill in the size of each
+                # group encountered (used by pct calculations later). also be
+                # sure to reset any of the items helping to calculate dups
+                if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1

         if pct:
             for i in range(N):
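The grp_sizes bookkeeping moved above feeds the pct branch at the end of the kernel, where each rank is divided by the size of its group (NaNs that were kept do not count toward the size). A small user-facing illustration with made-up data:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b"], "val": [10, 20, 20, 7]})
df.groupby("key")["val"].rank(method="min", pct=True)
# group 'a' has 3 values -> ranks 1, 2, 2 become 1/3, 2/3, 2/3; group 'b' -> 1.0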
