Skip to content

Commit ea077d3

Browse files
committed
BUG: Dense ranking with percent now uses 100% basis
- `DataFrame.rank()` and `Series.rank()` when `method='dense'` and `pct=True` now scales to 100%. See #15630
1 parent e4e87ec commit ea077d3

File tree

5 files changed

+246
-2
lines changed

5 files changed

+246
-2
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1113,3 +1113,4 @@ Other
11131113
- Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`)
11141114
- Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`)
11151115
- Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`)
1116+
- Bug in ``DataFrame.rank()`` and ``Series.rank()`` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not scaled to a 100% basis (:issue:`15630`)

pandas/_libs/algos_rank_helper.pxi.in

+8-2
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
198198
sum_ranks = dups = 0
199199
{{endif}}
200200
if pct:
201-
return ranks / count
201+
if tiebreak == TIEBREAK_DENSE:
202+
return ranks / total_tie_count
203+
else:
204+
return ranks / count
202205
else:
203206
return ranks
204207

@@ -370,7 +373,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
370373
ranks[i, argsorted[i, z]] = total_tie_count
371374
sum_ranks = dups = 0
372375
if pct:
373-
ranks[i, :] /= count
376+
if tiebreak == TIEBREAK_DENSE:
377+
ranks[i, :] /= total_tie_count
378+
else:
379+
ranks[i, :] /= count
374380
if axis == 0:
375381
return ranks.T
376382
else:

pandas/tests/frame/test_rank.py

+8
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,14 @@ def test_rank_methods_frame(self):
218218
expected = expected.astype('float64')
219219
tm.assert_frame_equal(result, expected)
220220

221+
def test_rank_dense_(self):
    """Check ``DataFrame.rank(method='dense', pct=True)`` column by column.

    Each column's largest value is expected to map to 1.0.
    """
    # GH15630, pct should be on 100% basis even when method='dense'
    df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
    result = df.rank(method='dense', pct=True)
    # column 0: one distinct value       -> every row is 1.0
    # column 1: two distinct values      -> 'A' is 1/2, 'B' is 2/2
    # column 2: three distinct values    -> 1/3, 2/3, 3/3
    expected = DataFrame([[1., 1., 1.],
                          [1., 0.5, 2. / 3],
                          [1., 0.5, 1. / 3]])
    # Use the ``tm`` alias, matching the neighbouring tests in this file.
    tm.assert_frame_equal(result, expected)
228+
221229
def test_rank_descending(self):
222230
dtypes = ['O', 'f8', 'i8']
223231

pandas/tests/series/test_rank.py

+19
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,25 @@ def test_rank_dense_method(self):
287287
expected = Series(exp).astype(result.dtype)
288288
assert_series_equal(result, expected)
289289

290+
def test_rank_dense_(self):
    """Check ``Series.rank(method='dense', pct=True)`` for several dtypes.

    Every ``(input, expected)`` pair is ranked as object, float64 and int64
    data; the largest value of each input is expected to rank 1.0.
    """
    # GH15630, pct should be on 100% basis even when method='dense'
    # Defined locally, like the sibling tests do, so the loop below does not
    # depend on a name from outside this function.
    dtypes = ['O', 'f8', 'i8']
    in_out = [([1], [1.]),
              ([2], [1.]),
              ([0], [1.]),
              ([2, 2], [1., 1.]),
              ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
              ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
              ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
              ([-5, -4, -3, -2, -1],
               [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]

    for ser, exp in in_out:
        for dtype in dtypes:
            s = Series(ser).astype(dtype)
            result = s.rank(method='dense', pct=True)
            # compare in whatever dtype rank() produced
            expected = Series(exp).astype(result.dtype)
            assert_series_equal(result, expected)
308+
290309
def test_rank_descending(self):
291310
dtypes = ['O', 'f8', 'i8']
292311

pandas/tests/test_stats.py

+210
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# -*- coding: utf-8 -*-
2+
from pandas import compat
3+
4+
from distutils.version import LooseVersion
5+
from numpy import nan
6+
import numpy as np
7+
8+
from pandas import Series, DataFrame
9+
10+
from pandas.compat import product
11+
from pandas.util.testing import (assert_frame_equal, assert_series_equal)
12+
import pandas.util.testing as tm
13+
14+
15+
class TestRank(tm.TestCase):
    """Tests for ``Series.rank`` and ``DataFrame.rank`` tie-breaking methods."""

    # Shared input: contains ties and NaNs; ``df`` holds it in both columns.
    s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
    df = DataFrame({'A': s, 'B': s})

    # Expected ascending ranks of ``s``, keyed by ``method``.
    results = {
        'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
                             3.5, 1.5, 8.0, nan, 5.5]),
        'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
        'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
        'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
        'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
    }

    def test_rank_tie_methods(self):
        """Rank ``self.s`` with every method, as default and object dtype."""
        s = self.s

        def _check(s, expected, method='average'):
            result = s.rank(method=method)
            tm.assert_series_equal(result, Series(expected))

        dtypes = [None, object]
        # (dtype, method) combinations that are skipped
        disabled = set([(object, 'first')])
        results = self.results

        for method, dtype in product(results, dtypes):
            if (dtype, method) in disabled:
                continue
            series = s if dtype is None else s.astype(dtype)
            _check(series, results[method], method=method)

    def test_rank_methods_series(self):
        """Compare ``Series.rank`` against ``scipy.stats.rankdata``."""
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        # same data, shifted and scaled
        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                # pandas' 'first' is compared with scipy's 'ordinal'
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index)

                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_series_equal(result, expected)

    def test_rank_methods_frame(self):
        """Compare ``DataFrame.rank`` on both axes against scipy."""
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    # pandas' 'first' is compared with scipy's 'ordinal'
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals,
                        m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols)

                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(result, expected)

    def test_rank_dense_method(self):
        """Check dense ranking, plain and with ``pct=True``."""
        dtypes = ['O', 'f8', 'i8']
        in_out = [([1], [1]),
                  ([2], [1]),
                  ([0], [1]),
                  ([2, 2], [1, 1]),
                  ([1, 2, 3], [1, 2, 3]),
                  ([4, 2, 1], [3, 2, 1],),
                  ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
                  ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense')
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)

        # GH15630, pct should be on 100% basis even when method='dense'
        # Two tied values share dense rank 1 of 1, so both are exactly 1.0.
        in_out = [([1], [1.]),
                  ([2], [1.]),
                  ([0], [1.]),
                  ([2, 2], [1., 1.]),
                  ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
                  ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
                  ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
                  ([-5, -4, -3, -2, -1],
                   [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense', pct=True)
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)

        df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
        result = df.rank(method='dense', pct=True)
        expected = DataFrame([[1., 1., 1.],
                              [1., 0.5, 2. / 3],
                              [1., 0.5, 1. / 3]])
        assert_frame_equal(result, expected)

    def test_rank_descending(self):
        """Check ``ascending=False`` equals ranking ``max - value`` ascending."""
        dtypes = ['O', 'f8', 'i8']

        for dtype, method in product(dtypes, self.results):
            if 'i' in dtype:
                # integer case: drop the NaNs instead of casting
                s = self.s.dropna()
                df = self.df.dropna()
            else:
                s = self.s.astype(dtype)
                df = self.df.astype(dtype)

            res = s.rank(ascending=False)
            expected = (s.max() - s).rank()
            assert_series_equal(res, expected)

            res = df.rank(ascending=False)
            expected = (df.max() - df).rank()
            assert_frame_equal(res, expected)

            if method == 'first' and dtype == 'O':
                continue

            expected = (s.max() - s).rank(method=method)
            res2 = s.rank(method=method, ascending=False)
            assert_series_equal(res2, expected)

            expected = (df.max() - df).rank(method=method)

            if dtype != 'O':
                res2 = df.rank(method=method, ascending=False,
                               numeric_only=True)
                assert_frame_equal(res2, expected)

            res3 = df.rank(method=method, ascending=False,
                           numeric_only=False)
            assert_frame_equal(res3, expected)

    def test_rank_2d_tie_methods(self):
        """Rank ``self.df`` along both axes with every method."""
        df = self.df

        def _check2d(df, expected, method='average', axis=0):
            exp_df = DataFrame({'A': expected, 'B': expected})

            if axis == 1:
                # transpose input and expectation so ranking runs along rows
                df = df.T
                exp_df = exp_df.T

            result = df.rank(method=method, axis=axis)
            assert_frame_equal(result, exp_df)

        dtypes = [None, object]
        # (dtype, method) combinations that are skipped
        disabled = set([(object, 'first')])
        results = self.results

        for method, axis, dtype in product(results, [0, 1], dtypes):
            if (dtype, method) in disabled:
                continue
            frame = df if dtype is None else df.astype(dtype)
            _check2d(frame, results[method], method=method, axis=axis)

    def test_rank_int(self):
        """Rank the NaN-free int64 version of ``self.s`` with every method."""
        s = self.s.dropna().astype('i8')

        for method, res in compat.iteritems(self.results):
            result = s.rank(method=method)
            expected = Series(res).dropna()
            # align labels: ``expected`` was built with a fresh default index
            expected.index = result.index
            assert_series_equal(result, expected)

    def test_rank_object_bug(self):
        """Smoke-test ranking an all-NaN object Series in both directions."""
        # GH 13445

        # smoke tests
        Series([np.nan] * 32).astype(object).rank(ascending=True)
        Series([np.nan] * 32).astype(object).rank(ascending=False)

0 commit comments

Comments
 (0)