Skip to content

Commit 0f9bea3

Browse files
rouzazari authored and gfyoung committed
BUG: Dense ranking with percent now uses 100% basis
- `DataFrame.rank()` and `Series.rank()` with `method='dense'` and `pct=True` now scale the result so that the highest rank is 100%. See pandas-dev#15630
1 parent d1f3689 commit 0f9bea3

File tree

5 files changed

+246
-2
lines changed

5 files changed

+246
-2
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1731,3 +1731,4 @@ Other
17311731
- Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`)
17321732
- Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`)
17331733
- Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`)
1734+
- Bug in ``DataFrame.rank()`` and ``Series.rank()`` when ``method='dense'`` and ``pct=True`` where the result was not scaled so that the highest rank equals 1.0 (:issue:`15630`)

pandas/_libs/algos_rank_helper.pxi.in

+8-2
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
213213
sum_ranks = dups = 0
214214
{{endif}}
215215
if pct:
216-
return ranks / count
216+
if tiebreak == TIEBREAK_DENSE:
217+
return ranks / total_tie_count
218+
else:
219+
return ranks / count
217220
else:
218221
return ranks
219222

@@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
385388
ranks[i, argsorted[i, z]] = total_tie_count
386389
sum_ranks = dups = 0
387390
if pct:
388-
ranks[i, :] /= count
391+
if tiebreak == TIEBREAK_DENSE:
392+
ranks[i, :] /= total_tie_count
393+
else:
394+
ranks[i, :] /= count
389395
if axis == 0:
390396
return ranks.T
391397
else:

pandas/tests/frame/test_rank.py

+8
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,14 @@ def test_rank_methods_frame(self):
217217
expected = expected.astype('float64')
218218
tm.assert_frame_equal(result, expected)
219219

220+
def test_rank_dense_(self):
221+
df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
222+
result = df.rank(method='dense', pct=True)
223+
expected = DataFrame([[1., 1., 1.],
224+
[1., 0.5, 2. / 3],
225+
[1., 0.5, 1. / 3]])
226+
assert_frame_equal(result, expected)
227+
220228
def test_rank_descending(self):
221229
dtypes = ['O', 'f8', 'i8']
222230

pandas/tests/series/test_rank.py

+19
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,25 @@ def test_rank_dense_method(self):
331331
expected = Series(exp).astype(result.dtype)
332332
assert_series_equal(result, expected)
333333

334+
def test_rank_dense_(self):
335+
# GH15630, pct should be on 100% basis even when method='dense'
336+
in_out = [([1], [1.]),
337+
([2], [1.]),
338+
([0], [1.]),
339+
([2, 2], [1., 1.]),
340+
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
341+
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
342+
([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
343+
([-5, -4, -3, -2, -1],
344+
[1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]
345+
346+
for ser, exp in in_out:
347+
for dtype in dtypes:
348+
s = Series(ser).astype(dtype)
349+
result = s.rank(method='dense', pct=True)
350+
expected = Series(exp).astype(result.dtype)
351+
assert_series_equal(result, expected)
352+
334353
def test_rank_descending(self):
335354
dtypes = ['O', 'f8', 'i8']
336355

pandas/tests/test_stats.py

+210
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# -*- coding: utf-8 -*-
2+
from pandas import compat
3+
4+
from distutils.version import LooseVersion
5+
from numpy import nan
6+
import numpy as np
7+
8+
from pandas import Series, DataFrame
9+
10+
from pandas.compat import product
11+
from pandas.util.testing import (assert_frame_equal, assert_series_equal)
12+
import pandas.util.testing as tm
13+
14+
15+
class TestRank(tm.TestCase):
    """Tests for ``Series.rank`` and ``DataFrame.rank``."""

    # Shared input: values containing ties and two NaN entries.
    s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
    df = DataFrame({'A': s, 'B': s})

    # Expected ranks of ``s`` keyed by tie-breaking method.
    results = {
        'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
                             3.5, 1.5, 8.0, nan, 5.5]),
        'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
        'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
        'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
        'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
    }

    def test_rank_tie_methods(self):
        """Rank ``self.s`` with every method, as default and object dtype."""
        s = self.s

        def _check(s, expected, method='average'):
            result = s.rank(method=method)
            tm.assert_series_equal(result, Series(expected))

        dtypes = [None, object]
        # The (object, 'first') combination is skipped by this test.
        disabled = {(object, 'first')}
        results = self.results

        for method, dtype in product(results, dtypes):
            if (dtype, method) in disabled:
                continue
            series = s if dtype is None else s.astype(dtype)
            _check(series, results[method], method=method)

    def test_rank_methods_series(self):
        """Compare ``Series.rank`` against ``scipy.stats.rankdata``."""
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        # Repeat with shifted and scaled copies of the same values.
        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                # pandas' 'first' is compared against scipy's 'ordinal'.
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index)

                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_series_equal(result, expected)

    def test_rank_methods_frame(self):
        """Compare ``DataFrame.rank`` on both axes against scipy."""
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals,
                        m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols)

                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(result, expected)

    def test_rank_dense_method(self):
        """Check dense ranks, with and without ``pct=True``."""
        dtypes = ['O', 'f8', 'i8']
        in_out = [([1], [1]),
                  ([2], [1]),
                  ([0], [1]),
                  ([2, 2], [1, 1]),
                  ([1, 2, 3], [1, 2, 3]),
                  ([4, 2, 1], [3, 2, 1],),
                  ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
                  ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense')
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)

        # GH15630, pct should be on 100% basis even when method='dense'
        # Two tied values share the single dense rank, so both are 1.0.
        in_out = [([1], [1.]),
                  ([2], [1.]),
                  ([0], [1.]),
                  ([2, 2], [1., 1.]),
                  ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
                  ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
                  ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
                  ([-5, -4, -3, -2, -1],
                   [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense', pct=True)
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)

        df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
        result = df.rank(method='dense', pct=True)
        expected = DataFrame([[1., 1., 1.],
                              [1., 0.5, 2. / 3],
                              [1., 0.5, 1. / 3]])
        assert_frame_equal(result, expected)

    def test_rank_descending(self):
        """``ascending=False`` must equal ranking ``max - values``."""
        dtypes = ['O', 'f8', 'i8']

        for dtype, method in product(dtypes, self.results):
            if 'i' in dtype:
                # The integer case uses the data with NaN rows dropped.
                s = self.s.dropna()
                df = self.df.dropna()
            else:
                s = self.s.astype(dtype)
                df = self.df.astype(dtype)

            res = s.rank(ascending=False)
            expected = (s.max() - s).rank()
            assert_series_equal(res, expected)

            res = df.rank(ascending=False)
            expected = (df.max() - df).rank()
            assert_frame_equal(res, expected)

            if method == 'first' and dtype == 'O':
                continue

            expected = (s.max() - s).rank(method=method)
            res2 = s.rank(method=method, ascending=False)
            assert_series_equal(res2, expected)

            expected = (df.max() - df).rank(method=method)

            if dtype != 'O':
                res2 = df.rank(method=method, ascending=False,
                               numeric_only=True)
                assert_frame_equal(res2, expected)

            res3 = df.rank(method=method, ascending=False,
                           numeric_only=False)
            assert_frame_equal(res3, expected)

    def test_rank_2d_tie_methods(self):
        """Rank ``self.df`` along both axes with every method."""
        df = self.df

        def _check2d(df, expected, method='average', axis=0):
            exp_df = DataFrame({'A': expected, 'B': expected})

            if axis == 1:
                # Transpose input and expectation together for axis=1.
                df = df.T
                exp_df = exp_df.T

            result = df.rank(method=method, axis=axis)
            assert_frame_equal(result, exp_df)

        dtypes = [None, object]
        # The (object, 'first') combination is skipped by this test.
        disabled = {(object, 'first')}
        results = self.results

        for method, axis, dtype in product(results, [0, 1], dtypes):
            if (dtype, method) in disabled:
                continue
            frame = df if dtype is None else df.astype(dtype)
            _check2d(frame, results[method], method=method, axis=axis)

    def test_rank_int(self):
        """Rank the NaN-free integer version of ``self.s``."""
        s = self.s.dropna().astype('i8')

        for method, res in compat.iteritems(self.results):
            result = s.rank(method=method)
            expected = Series(res).dropna()
            # Align labels with the result before comparing.
            expected.index = result.index
            assert_series_equal(result, expected)

    def test_rank_object_bug(self):
        """Ranking an all-NaN object Series should not raise."""
        # GH 13445

        # smoke tests
        Series([np.nan] * 32).astype(object).rank(ascending=True)
        Series([np.nan] * 32).astype(object).rank(ascending=False)

0 commit comments

Comments
 (0)