BUG: simplify the logic for rank calculation (pandas-dev#20731)

peterpanmj · peterpanmj · commit aa0043bc171c · 2018-05-27T11:37:28.000+08:00
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -452,18 +452,16 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
     cdef:
         TiebreakEnumType tiebreak
         Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0
-        Py_ssize_t grp_vals_seen=1, grp_na_count=0, lab_start=0, tie_count=0
-        Py_ssize_t grp_size=1
+        Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
         ndarray[int64_t] _as
+        ndarray[float64_t, ndim=2] grp_sizes
         ndarray[{{c_type}}] masked_vals
         ndarray[uint8_t] mask
         bint keep_na
-        bint has_na
         {{c_type}} nan_fill_val
 
     tiebreak = tiebreakers[ties_method]
     keep_na = na_option == 'keep'
-    has_na = False
     N, K = (<object> values).shape
     grp_sizes = np.ones_like(out)
 
@@ -520,7 +518,6 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
             # to the result where appropriate
 
             if keep_na and mask[_as[i]]:
-                if has_na == 0: has_na = 1
                 grp_na_count += 1
                 out[_as[i], 0] = nan
             else:
@@ -563,10 +560,10 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
             if (i == N - 1 or
                     (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
                     (mask[_as[i]] ^ mask[_as[i+1]])):
-                tie_count += 1
                 dups = sum_ranks = 0
                 val_start = i
                 grp_vals_seen += 1
+                grp_tie_count +=1
 
             # Similar to the previous conditional, check now if we are moving
             # to a new group. If so, keep track of the index where the new
@@ -575,22 +572,23 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
             # (used by pct calculations later). also be sure to reset any of
             # the items helping to calculate dups
             if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
-                if pct:
-                    if tiebreak != TIEBREAK_DENSE:
-                        for j in range(grp_start, i + 1):
-                            grp_size = i - grp_start + 1 - grp_na_count
-                            out[_as[j], 0] = out[_as[j], 0] / grp_size
-                    else:
-                        for j in range(lab_start, i + 1):
-                            out[_as[j], 0] = (out[_as[j], 0]
-                                              / (tie_count - has_na))
-                dups = sum_ranks = has_na = tie_count = 0
+                if tiebreak != TIEBREAK_DENSE:
+                    for j in range(grp_start, i + 1):
+                        grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
+                else:
+                    for j in range(grp_start, i + 1):
+                        grp_sizes[_as[j], 0] = (grp_tie_count
+                                                - (grp_na_count > 0))
+                dups = sum_ranks = 0
                 grp_na_count = 0
+                grp_tie_count = 0
                 val_start = i + 1
                 grp_start = i + 1
-                lab_start = i + 1
                 grp_vals_seen = 1
 
+        if pct:
+            for i in range(N):
+                out[i, 0] = out[i, 0] / grp_sizes[i, 0]
 {{endif}}
 {{endfor}}