
Commit cb3f778

PERF: improve performance of groupby rank (#21237)
1 parent 4274b84 commit cb3f778

File tree

1 file changed, +56 -48 lines changed

pandas/_libs/groupby_helper.pxi.in

+56 -48
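The change below touches only the Cython kernel behind groupby rank. For orientation, a minimal snippet that exercises the optimized path (column names and sizes are arbitrary, chosen only for illustration):

import numpy as np
import pandas as pd

# groupby rank dispatches to the group_rank_* template edited below
n = 1_000_000
df = pd.DataFrame({"key": np.random.randint(0, 1000, n),
                   "val": np.random.randint(0, 100, n)})
ranks = df.groupby("key")["val"].rank(method="average")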
@@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
     is_datetimelike : bool, default False
         unused in this method but provided for call compatibility with other
         Cython transformations
-    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
         * average: average rank of group
         * min: lowest rank in group
         * max: highest rank in group
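The docstring change above only reflows the ties_method line to fit the line-length limit; the methods themselves are unchanged. A small illustration of how two of the tie-breaking options differ (values chosen for illustration):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
                   "val": [1, 1, 2, 3, 3]})

# 'average' shares the mean rank among ties; 'dense' leaves no gaps after ties
df.groupby("key")["val"].rank(method="average")  # 1.5, 1.5, 3.0, 1.5, 1.5
df.groupby("key")["val"].rank(method="dense")    # 1.0, 1.0, 2.0, 1.0, 1.0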
@@ -513,26 +514,33 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
             # Used to calculate tiebreakers
             dups += 1
             sum_ranks += i - grp_start + 1
-
             # if keep_na, check for missing values and assign back
             # to the result where appropriate
-
             if keep_na and mask[_as[i]]:
                 grp_na_count += 1
                 out[_as[i], 0] = nan
-            else:
-                # this implementation is inefficient because it will
-                # continue overwriting previously encountered dups
-                # i.e. if 5 duplicated values are encountered it will
-                # write to the result as follows (assumes avg tiebreaker):
-                # 1
-                # .5 .5
-                # .33 .33 .33
-                # .25 .25 .25 .25
-                # .2 .2 .2 .2 .2
-                #
-                # could potentially be optimized to only write to the
-                # result once the last duplicate value is encountered
+                grp_vals_seen = i + 1
+                if (i == N - 1) or (labels[_as[i]] != labels[_as[i+1]]):
+                    grp_tie_count += 1
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1
+                continue
+
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
                 if tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         out[_as[j], 0] = sum_ranks / <float64_t>dups
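The comment removed above described the old behavior: every time another duplicate was seen, the whole run of ties was rewritten, which is quadratic in the length of a run. The new code instead writes a run exactly once, when the next sorted value (or group) differs; the extra mask and labels terms in the new conditional extend that check to NaN runs and group boundaries. A minimal pure-Python sketch of the write-once idea, ignoring groups, NaN handling and the other tiebreakers (not the actual Cython):

def average_ranks(sorted_vals):
    # rank a pre-sorted list with 'average' tie-breaking, writing each
    # run of duplicates exactly once instead of on every element
    out = [0.0] * len(sorted_vals)
    dups = 0        # length of the current run of equal values
    sum_ranks = 0   # sum of 1-based positions inside that run
    for i, v in enumerate(sorted_vals):
        dups += 1
        sum_ranks += i + 1
        # write only when the run ends: last element, or the next value differs
        if i == len(sorted_vals) - 1 or sorted_vals[i + 1] != v:
            for j in range(i - dups + 1, i + 1):
                out[j] = sum_ranks / dups
            dups = sum_ranks = 0
    return out

print(average_ranks([1, 1, 2, 5, 5, 5]))  # [1.5, 1.5, 3.0, 5.0, 5.0, 5.0]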
@@ -552,38 +560,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
                 for j in range(i - dups + 1, i + 1):
                     out[_as[j], 0] = grp_vals_seen

-            # look forward to the next value (using the sorting in _as)
-            # if the value does not equal the current value then we need to
-            # reset the dups and sum_ranks, knowing that a new value is coming
-            # up. the conditional also needs to handle nan equality and the
-            # end of iteration
-            if (i == N - 1 or
-                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
-                    (mask[_as[i]] ^ mask[_as[i+1]])):
-                dups = sum_ranks = 0
-                val_start = i
-                grp_vals_seen += 1
-                grp_tie_count +=1
-
-            # Similar to the previous conditional, check now if we are moving
-            # to a new group. If so, keep track of the index where the new
-            # group occurs, so the tiebreaker calculations can decrement that
-            # from their position. fill in the size of each group encountered
-            # (used by pct calculations later). also be sure to reset any of
-            # the items helping to calculate dups
-            if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
-                if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
-                else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = (grp_tie_count -
-                                                (grp_na_count > 0))
-                dups = sum_ranks = 0
-                grp_na_count = 0
-                grp_tie_count = 0
-                grp_start = i + 1
-                grp_vals_seen = 1
+                # look forward to the next value (using the sorting in _as)
+                # if the value does not equal the current value then we need to
+                # reset the dups and sum_ranks, knowing that a new value is
+                # coming up. the conditional also needs to handle nan equality
+                # and the end of iteration
+                if (i == N - 1 or
+                        (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                        (mask[_as[i]] ^ mask[_as[i+1]])):
+                    dups = sum_ranks = 0
+                    grp_vals_seen += 1
+                    grp_tie_count += 1
+
+                # Similar to the previous conditional, check now if we are
+                # moving to a new group. If so, keep track of the index where
+                # the new group occurs, so the tiebreaker calculations can
+                # decrement that from their position. fill in the size of each
+                # group encountered (used by pct calculations later). also be
+                # sure to reset any of the items helping to calculate dups
+                if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1

         if pct:
             for i in range(N):
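The grp_sizes bookkeeping moved above feeds the pct branch at the end of the kernel, where each rank is divided by the size of its group (NaNs that were kept do not count toward the size). A small user-facing illustration with made-up data:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b"], "val": [10, 20, 20, 7]})
df.groupby("key")["val"].rank(method="min", pct=True)
# group 'a' has 3 values -> ranks 1, 2, 2 become 1/3, 2/3, 2/3; group 'b' -> 1.0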
