Skip to content

Commit 61b141b

Browse files
dsm054 authored and jreback committed
ENH: add method='dense' to rank
1 parent 6efa4c1 commit 61b141b

File tree

5 files changed

+61
-10
lines changed

5 files changed

+61
-10
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ Improvements to existing features
127127
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
128128
- Testing statements updated to use specialized asserts (:issue:`6175`)
129129
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
130+
- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`)
130131
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
131132
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
132133
- perf improvements in DataFrame construction with certain offsets, by removing faulty caching

pandas/algos.pyx

+36-6
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,14 @@ cdef:
6868
int TIEBREAK_MAX = 2
6969
int TIEBREAK_FIRST = 3
7070
int TIEBREAK_FIRST_DESCENDING = 4
71+
int TIEBREAK_DENSE = 5
7172

7273
tiebreakers = {
7374
'average' : TIEBREAK_AVERAGE,
7475
'min' : TIEBREAK_MIN,
7576
'max' : TIEBREAK_MAX,
76-
'first' : TIEBREAK_FIRST
77+
'first' : TIEBREAK_FIRST,
78+
'dense' : TIEBREAK_DENSE,
7779
}
7880

7981

@@ -137,7 +139,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
137139
"""
138140

139141
cdef:
140-
Py_ssize_t i, j, n, dups = 0
142+
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
141143
ndarray[float64_t] sorted_data, ranks, values
142144
ndarray[int64_t] argsorted
143145
float64_t val, nan_value
@@ -200,6 +202,10 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
200202
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
201203
for j in range(i - dups + 1, i + 1):
202204
ranks[argsorted[j]] = 2 * i - j - dups + 2
205+
elif tiebreak == TIEBREAK_DENSE:
206+
total_tie_count += 1
207+
for j in range(i - dups + 1, i + 1):
208+
ranks[argsorted[j]] = total_tie_count
203209
sum_ranks = dups = 0
204210
if pct:
205211
return ranks / count
@@ -214,7 +220,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
214220
"""
215221

216222
cdef:
217-
Py_ssize_t i, j, n, dups = 0
223+
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
218224
ndarray[int64_t] sorted_data, values
219225
ndarray[float64_t] ranks
220226
ndarray[int64_t] argsorted
@@ -265,6 +271,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
265271
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
266272
for j in range(i - dups + 1, i + 1):
267273
ranks[argsorted[j]] = 2 * i - j - dups + 2
274+
elif tiebreak == TIEBREAK_DENSE:
275+
total_tie_count += 1
276+
for j in range(i - dups + 1, i + 1):
277+
ranks[argsorted[j]] = total_tie_count
268278
sum_ranks = dups = 0
269279
if pct:
270280
return ranks / count
@@ -279,7 +289,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
279289
"""
280290

281291
cdef:
282-
Py_ssize_t i, j, z, k, n, dups = 0
292+
Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
283293
ndarray[float64_t, ndim=2] ranks, values
284294
ndarray[int64_t, ndim=2] argsorted
285295
float64_t val, nan_value
@@ -324,6 +334,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
324334

325335
for i in range(n):
326336
dups = sum_ranks = 0
337+
total_tie_count = 0
327338
for j in range(k):
328339
sum_ranks += j + 1
329340
dups += 1
@@ -347,6 +358,10 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
347358
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
348359
for z in range(j - dups + 1, j + 1):
349360
ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
361+
elif tiebreak == TIEBREAK_DENSE:
362+
total_tie_count += 1
363+
for z in range(j - dups + 1, j + 1):
364+
ranks[i, argsorted[i, z]] = total_tie_count
350365
sum_ranks = dups = 0
351366

352367
if axis == 0:
@@ -362,7 +377,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
362377
"""
363378

364379
cdef:
365-
Py_ssize_t i, j, z, k, n, dups = 0
380+
Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
366381
ndarray[float64_t, ndim=2] ranks
367382
ndarray[int64_t, ndim=2] argsorted
368383
ndarray[int64_t, ndim=2, cast=True] values
@@ -395,6 +410,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
395410

396411
for i in range(n):
397412
dups = sum_ranks = 0
413+
total_tie_count = 0
398414
for j in range(k):
399415
sum_ranks += j + 1
400416
dups += 1
@@ -415,6 +431,10 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
415431
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
416432
for z in range(j - dups + 1, j + 1):
417433
ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
434+
elif tiebreak == TIEBREAK_DENSE:
435+
total_tie_count += 1
436+
for z in range(j - dups + 1, j + 1):
437+
ranks[i, argsorted[i, z]] = total_tie_count
418438
sum_ranks = dups = 0
419439

420440
if axis == 0:
@@ -430,7 +450,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
430450
"""
431451

432452
cdef:
433-
Py_ssize_t i, j, n, dups = 0
453+
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
434454
ndarray[float64_t] ranks
435455
ndarray sorted_data, values
436456
ndarray[int64_t] argsorted
@@ -502,6 +522,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
502522
ranks[argsorted[j]] = i + 1
503523
elif tiebreak == TIEBREAK_FIRST:
504524
raise ValueError('first not supported for non-numeric data')
525+
elif tiebreak == TIEBREAK_DENSE:
526+
total_tie_count += 1
527+
for j in range(i - dups + 1, i + 1):
528+
ranks[argsorted[j]] = total_tie_count
505529
sum_ranks = dups = 0
506530
if pct:
507531
ranks / count
@@ -545,6 +569,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
545569

546570
cdef:
547571
Py_ssize_t i, j, z, k, n, infs, dups = 0
572+
Py_ssize_t total_tie_count = 0
548573
ndarray[float64_t, ndim=2] ranks
549574
ndarray[object, ndim=2] values
550575
ndarray[int64_t, ndim=2] argsorted
@@ -600,6 +625,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
600625

601626
for i in range(n):
602627
dups = sum_ranks = infs = 0
628+
total_tie_count = 0
603629
for j in range(k):
604630
val = values[i, j]
605631
if val is nan_value and keep_na:
@@ -621,6 +647,10 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
621647
elif tiebreak == TIEBREAK_FIRST:
622648
raise ValueError('first not supported for '
623649
'non-numeric data')
650+
elif tiebreak == TIEBREAK_DENSE:
651+
total_tie_count += 1
652+
for z in range(j - dups + 1, j + 1):
653+
ranks[i, argsorted[i, z]] = total_tie_count
624654
sum_ranks = dups = 0
625655

626656
if axis == 0:

pandas/core/frame.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -4182,11 +4182,12 @@ def rank(self, axis=0, numeric_only=None, method='average',
41824182
Ranks over columns (0) or rows (1)
41834183
numeric_only : boolean, default None
41844184
Include only float, int, boolean data
4185-
method : {'average', 'min', 'max', 'first'}
4185+
method : {'average', 'min', 'max', 'first', 'dense'}
41864186
* average: average rank of group
41874187
* min: lowest rank in group
41884188
* max: highest rank in group
41894189
* first: ranks assigned in order they appear in the array
4190+
* dense: like 'min', but rank always increases by 1 between groups
41904191
na_option : {'keep', 'top', 'bottom'}
41914192
* keep: leave NA values where they are
41924193
* top: smallest rank if ascending

pandas/core/series.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1720,11 +1720,12 @@ def rank(self, method='average', na_option='keep', ascending=True,
17201720
17211721
Parameters
17221722
----------
1723-
method : {'average', 'min', 'max', 'first'}
1723+
method : {'average', 'min', 'max', 'first', 'dense'}
17241724
* average: average rank of group
17251725
* min: lowest rank in group
17261726
* max: highest rank in group
17271727
* first: ranks assigned in order they appear in the array
1728+
* dense: like 'min', but rank always increases by 1 between groups
17281729
na_option : {'keep'}
17291730
keep: leave NA values where they are
17301731
ascending : boolean, default True

pandas/tests/test_stats.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
assert_almost_equal)
1313
import pandas.util.testing as tm
1414

15-
1615
class TestRank(tm.TestCase):
1716
_multiprocess_can_split_ = True
1817
s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
@@ -23,7 +22,8 @@ class TestRank(tm.TestCase):
2322
3.5, 1.5, 8.0, nan, 5.5]),
2423
'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
2524
'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
26-
'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6])
25+
'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
26+
'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
2727
}
2828

2929
def test_rank_tie_methods(self):
@@ -43,6 +43,24 @@ def _check(s, expected, method='average'):
4343
series = s if dtype is None else s.astype(dtype)
4444
_check(series, results[method], method=method)
4545

46+
def test_rank_dense_method(self):
47+
dtypes = ['O', 'f8', 'i8']
48+
in_out = [([1], [1]),
49+
([2], [1]),
50+
([0], [1]),
51+
([2,2], [1,1]),
52+
([1,2,3], [1,2,3]),
53+
([4,2,1], [3,2,1],),
54+
([1,1,5,5,3], [1,1,3,3,2]),
55+
([-5,-4,-3,-2,-1], [1,2,3,4,5])]
56+
57+
for ser, exp in in_out:
58+
for dtype in dtypes:
59+
s = Series(ser).astype(dtype)
60+
result = s.rank(method='dense')
61+
expected = Series(exp).astype(result.dtype)
62+
assert_series_equal(result, expected)
63+
4664
def test_rank_descending(self):
4765
dtypes = ['O', 'f8', 'i8']
4866

0 commit comments

Comments
 (0)