Commit fbb05d4

PERF: improve performance of groupby rank (pandas-dev#21237)

1 parent ab668b0 commit fbb05d4
3 files changed: +71, -54 lines

asv_bench/benchmarks/groupby.py (+20, -1)
@@ -5,7 +5,7 @@
 
 import numpy as np
 from pandas import (DataFrame, Series, MultiIndex, date_range, period_range,
-                    TimeGrouper, Categorical)
+                    TimeGrouper, Categorical, Timestamp)
 import pandas.util.testing as tm
 
 from .pandas_vb_common import setup  # noqa
@@ -385,6 +385,25 @@ def time_dtype_as_field(self, dtype, method, application):
         self.as_field_method()
 
 
+class RankWithTies(object):
+    # GH 21237
+    goal_time = 0.2
+    param_names = ['dtype', 'tie_method']
+    params = [['float64', 'float32', 'int64', 'datetime64'],
+              ['first', 'average', 'dense', 'min', 'max']]
+
+    def setup(self, dtype, tie_method):
+        N = 10**4
+        if dtype == 'datetime64':
+            data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
+        else:
+            data = np.array([1] * N, dtype=dtype)
+        self.df = DataFrame({'values': data, 'key': ['foo'] * N})
+
+    def time_rank_ties(self, dtype, tie_method):
+        self.df.groupby('key').rank(method=tie_method)
+
+
 class Float32(object):
     # GH 13335
     goal_time = 0.2
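
The new RankWithTies benchmark times groupby rank over a single group in which every value ties. A rough standalone equivalent, useful for eyeballing the effect of the change locally (uses timeit rather than asv; not part of the commit):

```python
# Standalone sketch (not part of the commit): time groupby rank on a column
# where every value ties, mirroring what RankWithTies measures via asv.
import timeit

import numpy as np
from pandas import DataFrame

N = 10**4
df = DataFrame({'values': np.ones(N, dtype='float64'), 'key': ['foo'] * N})

for tie_method in ['first', 'average', 'dense', 'min', 'max']:
    elapsed = timeit.timeit(
        lambda: df.groupby('key').rank(method=tie_method), number=10)
    print(tie_method, elapsed / 10)
```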

doc/source/whatsnew/v0.24.0.txt (+1, -0)
@@ -65,6 +65,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
+- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 -
 
 .. _whatsnew_0240.docs:

pandas/_libs/groupby_helper.pxi.in (+50, -53)
@@ -429,7 +429,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
     is_datetimelike : bool, default False
         unused in this method but provided for call compatibility with other
         Cython transformations
-    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
         * average: average rank of group
         * min: lowest rank in group
         * max: highest rank in group
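
For reference, the five ties_method options documented above differ as follows on a small group containing duplicates (illustrative snippet, not part of the diff; expected values follow standard rank semantics):

```python
# Illustration of the ties_method options on one group with duplicate values.
from pandas import DataFrame

df = DataFrame({'key': ['a', 'a', 'a'], 'values': [2, 2, 3]})
for method in ['average', 'min', 'max', 'first', 'dense']:
    print(method, df.groupby('key')['values'].rank(method=method).tolist())

# Expected output (approximately):
#   average [1.5, 1.5, 3.0]
#   min     [1.0, 1.0, 3.0]
#   max     [2.0, 2.0, 3.0]
#   first   [1.0, 2.0, 3.0]
#   dense   [1.0, 1.0, 2.0]
```
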
@@ -514,26 +515,22 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
             dups += 1
             sum_ranks += i - grp_start + 1
 
-            # if keep_na, check for missing values and assign back
-            # to the result where appropriate
-
-            if keep_na and mask[_as[i]]:
-                grp_na_count += 1
-                out[_as[i], 0] = nan
-            else:
-                # this implementation is inefficient because it will
-                # continue overwriting previously encountered dups
-                # i.e. if 5 duplicated values are encountered it will
-                # write to the result as follows (assumes avg tiebreaker):
-                # 1
-                # .5 .5
-                # .33 .33 .33
-                # .25 .25 .25 .25
-                # .2 .2 .2 .2 .2
-                #
-                # could potentially be optimized to only write to the
-                # result once the last duplicate value is encountered
-                if tiebreak == TIEBREAK_AVERAGE:
+            # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back #dups steps(
+            # the number of occurrence of current value) and assign the ranks
+            # based on the the starting index of the current group (grp_start)
+            # and the current index
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
+                # if keep_na, check for missing values and assign back
+                # to the result where appropriate
+                if keep_na and mask[_as[i]]:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = nan
+                    grp_na_count = dups
+                elif tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         out[_as[j], 0] = sum_ranks / <float64_t>dups
                 elif tiebreak == TIEBREAK_MIN:
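
This hunk is the core of the speed-up: instead of rewriting ranks every time another duplicate is seen, the loop now assigns a run of tied values exactly once, at the index where the value, the NA mask, or the group label next changes. A pure-Python sketch of that pattern, reduced to a single group with the average tiebreak and no NA handling (variable names echo the Cython code, but this is illustrative, not the actual implementation):

```python
# Sketch of the "assign once per run of ties" pattern (single group,
# average tiebreak only; illustrative, not the Cython implementation).
def average_ranks(values):
    order = sorted(range(len(values)), key=values.__getitem__)  # like _as
    out = [0.0] * len(values)
    dups = 0       # length of the current run of equal values
    sum_ranks = 0  # sum of the 1-based positions inside the run
    for i, idx in enumerate(order):
        dups += 1
        sum_ranks += i + 1
        # Write only at a transition: last element, or the next sorted
        # value differs from the current one.
        if i == len(values) - 1 or values[order[i + 1]] != values[idx]:
            for j in range(i - dups + 1, i + 1):
                out[order[j]] = sum_ranks / dups
            dups = sum_ranks = 0
    return out

print(average_ranks([3, 1, 3, 2]))  # expected: [3.5, 1.0, 3.5, 2.0]
```
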
@@ -552,38 +549,38 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
                     for j in range(i - dups + 1, i + 1):
                         out[_as[j], 0] = grp_vals_seen
 
-            # look forward to the next value (using the sorting in _as)
-            # if the value does not equal the current value then we need to
-            # reset the dups and sum_ranks, knowing that a new value is coming
-            # up. the conditional also needs to handle nan equality and the
-            # end of iteration
-            if (i == N - 1 or
-                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
-                    (mask[_as[i]] ^ mask[_as[i+1]])):
-                dups = sum_ranks = 0
-                val_start = i
-                grp_vals_seen += 1
-                grp_tie_count +=1
-
-            # Similar to the previous conditional, check now if we are moving
-            # to a new group. If so, keep track of the index where the new
-            # group occurs, so the tiebreaker calculations can decrement that
-            # from their position. fill in the size of each group encountered
-            # (used by pct calculations later). also be sure to reset any of
-            # the items helping to calculate dups
-            if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
-                if tiebreak != TIEBREAK_DENSE:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
-                else:
-                    for j in range(grp_start, i + 1):
-                        grp_sizes[_as[j], 0] = (grp_tie_count -
-                                                (grp_na_count > 0))
-                dups = sum_ranks = 0
-                grp_na_count = 0
-                grp_tie_count = 0
-                grp_start = i + 1
-                grp_vals_seen = 1
+                # look forward to the next value (using the sorting in _as)
+                # if the value does not equal the current value then we need to
+                # reset the dups and sum_ranks, knowing that a new value is
+                # coming up. the conditional also needs to handle nan equality
+                # and the end of iteration
+                if (i == N - 1 or
+                        (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                        (mask[_as[i]] ^ mask[_as[i+1]])):
+                    dups = sum_ranks = 0
+                    grp_vals_seen += 1
+                    grp_tie_count += 1
+
+                # Similar to the previous conditional, check now if we are
+                # moving to a new group. If so, keep track of the index where
+                # the new group occurs, so the tiebreaker calculations can
+                # decrement that from their position. fill in the size of each
+                # group encountered (used by pct calculations later). also be
+                # sure to reset any of the items helping to calculate dups
+                if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1
 
         if pct:
             for i in range(N):