diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7777322071957..0725bbeb6c36d 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -5,7 +5,7 @@ import numpy as np from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, - TimeGrouper, Categorical) + TimeGrouper, Categorical, Timestamp) import pandas.util.testing as tm from .pandas_vb_common import setup # noqa @@ -385,6 +385,25 @@ def time_dtype_as_field(self, dtype, method, application): self.as_field_method() +class RankWithTies(object): + # GH 21237 + goal_time = 0.2 + param_names = ['dtype', 'tie_method'] + params = [['float64', 'float32', 'int64', 'datetime64'], + ['first', 'average', 'dense', 'min', 'max']] + + def setup(self, dtype, tie_method): + N = 10**4 + if dtype == 'datetime64': + data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) + else: + data = np.array([1] * N, dtype=dtype) + self.df = DataFrame({'values': data, 'key': ['foo'] * N}) + + def time_rank_ties(self, dtype, tie_method): + self.df.groupby('key').rank(method=tie_method) + + class Float32(object): # GH 13335 goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 68c1839221508..eaeda8bf190da 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -65,6 +65,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - .. 
_whatsnew_0240.docs: diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b3e9b7c9e69ee..0062a6c8d31ab 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, is_datetimelike : bool, default False unused in this method but provided for call compatibility with other Cython transformations - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default + 'average' * average: average rank of group * min: lowest rank in group * max: highest rank in group @@ -514,26 +515,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups += 1 sum_ranks += i - grp_start + 1 - # if keep_na, check for missing values and assign back - # to the result where appropriate - - if keep_na and mask[_as[i]]: - grp_na_count += 1 - out[_as[i], 0] = nan - else: - # this implementation is inefficient because it will - # continue overwriting previously encountered dups - # i.e. if 5 duplicated values are encountered it will - # write to the result as follows (assumes avg tiebreaker): - # 1 - # .5 .5 - # .33 .33 .33 - # .25 .25 .25 .25 - # .2 .2 .2 .2 .2 - # - # could potentially be optimized to only write to the - # result once the last duplicate value is encountered - if tiebreak == TIEBREAK_AVERAGE: + # Update out only when there is a transition of values or labels. 
+ # When a new value or group is encountered, go back #dups steps ( + the number of occurrences of the current value) and assign the ranks + based on the starting index of the current group (grp_start) + and the current index + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]]) or + (labels[_as[i]] != labels[_as[i+1]])): + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and mask[_as[i]]: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = nan + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[_as[j], 0] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: @@ -552,38 +549,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, for j in range(i - dups + 1, i + 1): out[_as[j], 0] = grp_vals_seen - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is coming - # up. the conditional also needs to handle nan equality and the - # end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - val_start = i - grp_vals_seen += 1 - grp_tie_count +=1 - - # Similar to the previous conditional, check now if we are moving - # to a new group. If so, keep track of the index where the new - # group occurs, so the tiebreaker calculations can decrement that - # from their position. fill in the size of each group encountered - # (used by pct calculations later). 
also be sure to reset any of - # the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. the conditional also needs to handle nan equality + # and the end of iteration + if (i == N - 1 or + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or + (mask[_as[i]] ^ mask[_as[i+1]])): + dups = sum_ranks = 0 + grp_vals_seen += 1 + grp_tie_count += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. fill in the size of each + # group encountered (used by pct calculations later). also be + # sure to reset any of the items helping to calculate dups + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (i - grp_start + 1 - + grp_na_count) + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count - + (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 if pct: for i in range(N):