Skip to content

Commit dca8d48

Browse files
committed
Simplify handling of nan values in groupby rank
1 parent 09238d3 commit dca8d48

File tree

2 files changed: 10 additions and 28 deletions

2 files changed

+10
-28
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ Removal of prior version deprecations/changes
6363
Performance Improvements
6464
~~~~~~~~~~~~~~~~~~~~~~~~
6565

66-
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`21237`)
66+
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
6767
-
6868
-
6969

pandas/_libs/groupby_helper.pxi.in

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -514,30 +514,6 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
514514
# Used to calculate tiebreakers
515515
dups += 1
516516
sum_ranks += i - grp_start + 1
517-
# if keep_na, check for missing values and assign back
518-
# to the result where appropriate
519-
if keep_na and mask[_as[i]]:
520-
grp_na_count += 1
521-
out[_as[i], 0] = nan
522-
grp_vals_seen = i + 1
523-
# when label transition happens, update grp_size to keep track
524-
# of number of nans encountered and increment grp_tie_count
525-
if (i== N-1) or (labels[_as[i]] != labels[_as[i+1]]):
526-
grp_tie_count +=1
527-
if tiebreak != TIEBREAK_DENSE:
528-
for j in range(grp_start, i + 1):
529-
grp_sizes[_as[j], 0] = (i - grp_start + 1 -
530-
grp_na_count)
531-
else:
532-
for j in range(grp_start, i + 1):
533-
grp_sizes[_as[j], 0] = (grp_tie_count -
534-
(grp_na_count > 0))
535-
dups = sum_ranks = 0
536-
grp_na_count = 0
537-
grp_tie_count = 0
538-
grp_start = i + 1
539-
grp_vals_seen = 1
540-
continue
541517

542518
# Update out only when there is a transition of values or labels.
543519
# When a new value or group is encountered, go back #dups steps(
@@ -548,7 +524,13 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
548524
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
549525
(mask[_as[i]] ^ mask[_as[i+1]]) or
550526
(labels[_as[i]] != labels[_as[i+1]])):
551-
if tiebreak == TIEBREAK_AVERAGE:
527+
# if keep_na, check for missing values and assign back
528+
# to the result where appropriate
529+
if keep_na and mask[_as[i]]:
530+
for j in range(i - dups + 1, i + 1):
531+
out[_as[j], 0] = nan
532+
grp_na_count = dups
533+
elif tiebreak == TIEBREAK_AVERAGE:
552534
for j in range(i - dups + 1, i + 1):
553535
out[_as[j], 0] = sum_ranks / <float64_t>dups
554536
elif tiebreak == TIEBREAK_MIN:
@@ -570,14 +552,14 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
570552
# look forward to the next value (using the sorting in _as)
571553
# if the value does not equal the current value then we need to
572554
# reset the dups and sum_ranks, knowing that a new value is
573-
# coming up. the condition also needs to handle nan equality
555+
# coming up. the conditional also needs to handle nan equality
574556
# and the end of iteration
575557
if (i == N - 1 or
576558
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
577559
(mask[_as[i]] ^ mask[_as[i+1]])):
578560
dups = sum_ranks = 0
579561
grp_vals_seen += 1
580-
grp_tie_count +=1
562+
grp_tie_count += 1
581563

582564
# Similar to the previous conditional, check now if we are
583565
# moving to a new group. If so, keep track of the index where

0 commit comments

Comments
 (0)