Skip to content

Commit 1d73cf3

Browse files
rouzazari authored and
jreback committed
BUG: Dense ranking with percent now uses 100% basis (pandas-dev#15639)
1 parent f9fd540 commit 1d73cf3

File tree

4 files changed

+139
-8
lines changed

4 files changed

+139
-8
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,7 @@ Offsets
907907

908908
Numeric
909909
^^^^^^^
910+
- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not computed relative to the number of distinct observations (:issue:`15630`)
910911
- Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`)
911912
- Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)
912913
- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`)

pandas/_libs/algos_rank_helper.pxi.in

+8-2
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
213213
sum_ranks = dups = 0
214214
{{endif}}
215215
if pct:
216-
return ranks / count
216+
if tiebreak == TIEBREAK_DENSE:
217+
return ranks / total_tie_count
218+
else:
219+
return ranks / count
217220
else:
218221
return ranks
219222

@@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
385388
ranks[i, argsorted[i, z]] = total_tie_count
386389
sum_ranks = dups = 0
387390
if pct:
388-
ranks[i, :] /= count
391+
if tiebreak == TIEBREAK_DENSE:
392+
ranks[i, :] /= total_tie_count
393+
else:
394+
ranks[i, :] /= count
389395
if axis == 0:
390396
return ranks.T
391397
else:

pandas/tests/frame/test_rank.py

+37-6
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
# -*- coding: utf-8 -*-
22
import pytest
3-
from datetime import timedelta, datetime
4-
from distutils.version import LooseVersion
5-
from numpy import nan
63
import numpy as np
4+
import pandas.util.testing as tm
75

8-
from pandas import Series, DataFrame
6+
from distutils.version import LooseVersion
7+
from datetime import timedelta, datetime
8+
from numpy import nan
99

10-
from pandas.compat import product
1110
from pandas.util.testing import assert_frame_equal
12-
import pandas.util.testing as tm
1311
from pandas.tests.frame.common import TestData
12+
from pandas import Series, DataFrame
13+
from pandas.compat import product
1414

1515

1616
class TestRank(TestData):
@@ -266,3 +266,34 @@ def _check2d(df, expected, method='average', axis=0):
266266
continue
267267
frame = df if dtype is None else df.astype(dtype)
268268
_check2d(frame, results[method], method=method, axis=axis)
269+
270+
271+
@pytest.mark.parametrize(
272+
"method,exp", [("dense",
273+
[[1., 1., 1.],
274+
[1., 0.5, 2. / 3],
275+
[1., 0.5, 1. / 3]]),
276+
("min",
277+
[[1. / 3, 1., 1.],
278+
[1. / 3, 1. / 3, 2. / 3],
279+
[1. / 3, 1. / 3, 1. / 3]]),
280+
("max",
281+
[[1., 1., 1.],
282+
[1., 2. / 3, 2. / 3],
283+
[1., 2. / 3, 1. / 3]]),
284+
("average",
285+
[[2. / 3, 1., 1.],
286+
[2. / 3, 0.5, 2. / 3],
287+
[2. / 3, 0.5, 1. / 3]]),
288+
("first",
289+
[[1. / 3, 1., 1.],
290+
[2. / 3, 1. / 3, 2. / 3],
291+
[3. / 3, 2. / 3, 1. / 3]])])
292+
def test_rank_pct_true(method, exp):
293+
# see gh-15630.
294+
295+
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
296+
result = df.rank(method=method, pct=True)
297+
298+
expected = DataFrame(exp)
299+
tm.assert_frame_equal(result, expected)

pandas/tests/series/test_rank.py

+93
Original file line numberDiff line numberDiff line change
@@ -376,3 +376,96 @@ def test_rank_modify_inplace(self):
376376
s.rank()
377377
result = s
378378
assert_series_equal(result, expected)
379+
380+
381+
# GH15630, pct should be on 100% basis when method='dense'
382+
383+
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
384+
@pytest.mark.parametrize('ser, exp', [
385+
([1], [1.]),
386+
([1, 2], [1. / 2, 2. / 2]),
387+
([2, 2], [1., 1.]),
388+
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
389+
([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]),
390+
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
391+
([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
392+
([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]),
393+
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
394+
def test_rank_dense_pct(dtype, ser, exp):
395+
s = Series(ser).astype(dtype)
396+
result = s.rank(method='dense', pct=True)
397+
expected = Series(exp).astype(result.dtype)
398+
assert_series_equal(result, expected)
399+
400+
401+
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
402+
@pytest.mark.parametrize('ser, exp', [
403+
([1], [1.]),
404+
([1, 2], [1. / 2, 2. / 2]),
405+
([2, 2], [1. / 2, 1. / 2]),
406+
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
407+
([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]),
408+
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
409+
([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]),
410+
([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]),
411+
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
412+
def test_rank_min_pct(dtype, ser, exp):
413+
s = Series(ser).astype(dtype)
414+
result = s.rank(method='min', pct=True)
415+
expected = Series(exp).astype(result.dtype)
416+
assert_series_equal(result, expected)
417+
418+
419+
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
420+
@pytest.mark.parametrize('ser, exp', [
421+
([1], [1.]),
422+
([1, 2], [1. / 2, 2. / 2]),
423+
([2, 2], [1., 1.]),
424+
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
425+
([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]),
426+
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
427+
([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]),
428+
([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]),
429+
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
430+
def test_rank_max_pct(dtype, ser, exp):
431+
s = Series(ser).astype(dtype)
432+
result = s.rank(method='max', pct=True)
433+
expected = Series(exp).astype(result.dtype)
434+
assert_series_equal(result, expected)
435+
436+
437+
@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
438+
@pytest.mark.parametrize('ser, exp', [
439+
([1], [1.]),
440+
([1, 2], [1. / 2, 2. / 2]),
441+
([2, 2], [1.5 / 2, 1.5 / 2]),
442+
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
443+
([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]),
444+
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
445+
([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]),
446+
([1, 1, 3, 3, 5, 5],
447+
[1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]),
448+
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
449+
def test_rank_average_pct(dtype, ser, exp):
450+
s = Series(ser).astype(dtype)
451+
result = s.rank(method='average', pct=True)
452+
expected = Series(exp).astype(result.dtype)
453+
assert_series_equal(result, expected)
454+
455+
456+
@pytest.mark.parametrize('dtype', ['f8', 'i8'])
457+
@pytest.mark.parametrize('ser, exp', [
458+
([1], [1.]),
459+
([1, 2], [1. / 2, 2. / 2]),
460+
([2, 2], [1. / 2, 2. / 2.]),
461+
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
462+
([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]),
463+
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
464+
([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]),
465+
([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]),
466+
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
467+
def test_rank_first_pct(dtype, ser, exp):
468+
s = Series(ser).astype(dtype)
469+
result = s.rank(method='first', pct=True)
470+
expected = Series(exp).astype(result.dtype)
471+
assert_series_equal(result, expected)

0 commit comments

Comments
 (0)