Commit fbb05d4

PERF: improve performance of groupby rank (pandas-dev#21237)

1 parent ab668b0 commit fbb05d4
3 files changed: +71, -54 lines

asv_bench/benchmarks/groupby.py (+20, -1)
@@ -5,7 +5,7 @@
 
 import numpy as np
 from pandas import (DataFrame, Series, MultiIndex, date_range, period_range,
-                    TimeGrouper, Categorical)
+                    TimeGrouper, Categorical, Timestamp)
 import pandas.util.testing as tm
 
 from .pandas_vb_common import setup  # noqa
@@ -385,6 +385,25 @@ def time_dtype_as_field(self, dtype, method, application):
         self.as_field_method()
 
 
+class RankWithTies(object):
+    # GH 21237
+    goal_time = 0.2
+    param_names = ['dtype', 'tie_method']
+    params = [['float64', 'float32', 'int64', 'datetime64'],
+              ['first', 'average', 'dense', 'min', 'max']]
+
+    def setup(self, dtype, tie_method):
+        N = 10**4
+        if dtype == 'datetime64':
+            data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
+        else:
+            data = np.array([1] * N, dtype=dtype)
+        self.df = DataFrame({'values': data, 'key': ['foo'] * N})
+
+    def time_rank_ties(self, dtype, tie_method):
+        self.df.groupby('key').rank(method=tie_method)
+
+
 class Float32(object):
     # GH 13335
     goal_time = 0.2
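
The new RankWithTies benchmark times groupby rank over a single group in which every value ties. A rough standalone equivalent, useful for eyeballing the effect of the change locally (uses timeit rather than asv; not part of the commit):

```python
# Standalone sketch (not part of the commit): time groupby rank on a column
# where every value ties, mirroring what RankWithTies measures via asv.
import timeit

import numpy as np
from pandas import DataFrame

N = 10**4
df = DataFrame({'values': np.ones(N, dtype='float64'), 'key': ['foo'] * N})

for tie_method in ['first', 'average', 'dense', 'min', 'max']:
    elapsed = timeit.timeit(
        lambda: df.groupby('key').rank(method=tie_method), number=10)
    print(tie_method, elapsed / 10)
```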

doc/source/whatsnew/v0.24.0.txt (+1, -0)
@@ -65,6 +65,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
+- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 -
 
 .. _whatsnew_0240.docs:

pandas/_libs/groupby_helper.pxi.in (+50, -53)
@@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
     is_datetimelike : bool, default False
         unused in this method but provided for call compatibility with other
         Cython transformations
-    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
         * average: average rank of group
         * min: lowest rank in group
         * max: highest rank in group
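
For reference, the five ties_method options documented above differ as follows on a small group containing duplicates (illustrative snippet, not part of the diff; expected values follow standard rank semantics):

```python
# Illustration of the ties_method options on one group with duplicate values.
from pandas import DataFrame

df = DataFrame({'key': ['a', 'a', 'a'], 'values': [2, 2, 3]})
for method in ['average', 'min', 'max', 'first', 'dense']:
    print(method, df.groupby('key')['values'].rank(method=method).tolist())

# Expected output (approximately):
#   average [1.5, 1.5, 3.0]
#   min     [1.0, 1.0, 3.0]
#   max     [2.0, 2.0, 3.0]
#   first   [1.0, 2.0, 3.0]
#   dense   [1.0, 1.0, 2.0]
```
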
@@ -514,26 +515,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
             dups += 1
             sum_ranks += i - grp_start + 1
 
-            # if keep_na, check for missing values and assign back
-            # to the result where appropriate
-
-            if keep_na and mask[_as[i]]:
-                grp_na_count += 1
-                out[_as[i], 0] = nan
-            else:
-                # this implementation is inefficient because it will
-                # continue overwriting previously encountered dups
-                # i.e. if 5 duplicated values are encountered it will
-                # write to the result as follows (assumes avg tiebreaker):
-                # 1
-                # .5 .5
-                # .33 .33 .33
-                # .25 .25 .25 .25
-                # .2 .2 .2 .2 .2
-                #
-                # could potentially be optimized to only write to the
-                # result once the last duplicate value is encountered
-                if tiebreak == TIEBREAK_AVERAGE:
+            # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back #dups steps(
+            # the number of occurrence of current value) and assign the ranks
+            # based on the the starting index of the current group (grp_start)
+            # and the current index
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
+                # if keep_na, check for missing values and assign back
+                # to the result where appropriate
+                if keep_na and mask[_as[i]]:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = nan
+                    grp_na_count = dups
+                elif tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         out[_as[j], 0] = sum_ranks / <float64_t>dups
                 elif tiebreak == TIEBREAK_MIN:
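
This hunk is the core of the speed-up: instead of rewriting ranks every time another duplicate is seen, the loop now assigns a run of tied values exactly once, at the index where the value, the NA mask, or the group label next changes. A pure-Python sketch of that pattern, reduced to a single group with the average tiebreak and no NA handling (variable names echo the Cython code, but this is illustrative, not the actual implementation):

```python
# Sketch of the "assign once per run of ties" pattern (single group,
# average tiebreak only; illustrative, not the Cython implementation).
def average_ranks(values):
    order = sorted(range(len(values)), key=values.__getitem__)  # like _as
    out = [0.0] * len(values)
    dups = 0       # length of the current run of equal values
    sum_ranks = 0  # sum of the 1-based positions inside the run
    for i, idx in enumerate(order):
        dups += 1
        sum_ranks += i + 1
        # Write only at a transition: last element, or the next sorted
        # value differs from the current one.
        if i == len(values) - 1 or values[order[i + 1]] != values[idx]:
            for j in range(i - dups + 1, i + 1):
                out[order[j]] = sum_ranks / dups
            dups = sum_ranks = 0
    return out

print(average_ranks([3, 1, 3, 2]))  # expected: [3.5, 1.0, 3.5, 2.0]
```
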
@@ -552,38 +549,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
                     for j in range(i - dups + 1, i + 1):
                         out[_as[j], 0] = grp_vals_seen
 
-            # look forward to the next value (using the sorting in _as)
-            # if the value does not equal the current value then we need to
-            # reset the dups and sum_ranks, knowing that a new value is coming
-            # up. the conditional also needs to handle nan equality and the
-            # end of iteration
-            if (i == N - 1 or
-                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
-                    (mask[_as[i]] ^ mask[_as[i+1]])):
-                dups = sum_ranks = 0
-                val_start = i
-                grp_vals_seen += 1
-                grp_tie_count +=1
-
-            # Similar to the previous conditional, check now if we are moving
-            # to a new group. If so, keep track of the index where the new
-            # group occurs, so the tiebreaker calculations can decrement that
-            # from their position. fill in the size of each group encountered
-            # (used by pct calculations later). also be sure to reset any of
-            # the items helping to calculate dups
-            if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
-                if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
-                else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = (grp_tie_count -
-                                                (grp_na_count > 0))
-                dups = sum_ranks = 0
-                grp_na_count = 0
-                grp_tie_count = 0
-                grp_start = i + 1
-                grp_vals_seen = 1
+                # look forward to the next value (using the sorting in _as)
+                # if the value does not equal the current value then we need to
+                # reset the dups and sum_ranks, knowing that a new value is
+                # coming up. the conditional also needs to handle nan equality
+                # and the end of iteration
+                if (i == N - 1 or
+                        (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                        (mask[_as[i]] ^ mask[_as[i+1]])):
+                    dups = sum_ranks = 0
+                    grp_vals_seen += 1
+                    grp_tie_count += 1
+
+                # Similar to the previous conditional, check now if we are
+                # moving to a new group. If so, keep track of the index where
+                # the new group occurs, so the tiebreaker calculations can
+                # decrement that from their position. fill in the size of each
+                # group encountered (used by pct calculations later). also be
+                # sure to reset any of the items helping to calculate dups
+                if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1
 
         if pct:
             for i in range(N):