Skip to content

Commit 0d35dd4

Browse files
committed
Merge pull request pandas-dev#8379 from behzadnouri/rank-tol
BUG: floats cannot be ranked with tolerance
2 parents 9ac01a7 + 6c11ac2 commit 0d35dd4

File tree

4 files changed

+43
-18
lines changed

4 files changed

+43
-18
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ Bug Fixes
349349
- Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
350350
- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).
351351
- Bug in ``unstack`` with ``TimedeltaIndex`` or ``DatetimeIndex`` and nulls (:issue:`9491`).
352+
- Bug in ``rank`` where comparing floats with a tolerance caused inconsistent behaviour; floats are now compared exactly when detecting ties (:issue:`8365`).
352353

353354

354355
- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).

pandas/algos.pyx

+4-17
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ cimport cython
77
import_array()
88

99
cdef float64_t FP_ERR = 1e-13
10-
cdef float64_t REL_TOL = 1e-07
1110

1211
cimport util
1312

@@ -136,18 +135,6 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
136135
return result
137136

138137

139-
cdef inline bint float64_are_diff(float64_t left, float64_t right):
    """
    Report whether ``left`` differs from ``right`` beyond a relative tolerance.

    When ``right`` equals ``MAXfloat64`` or ``-MAXfloat64`` the two values are
    compared exactly.  Otherwise they count as different only when
    ``|left - right|`` exceeds ``REL_TOL * |right|``; note the tolerance is
    scaled by ``right`` alone, so the check is not symmetric in its arguments.
    """
    cdef double gap, tolerance
    # No tolerance at the +/-MAXfloat64 extremes: exact comparison only.
    if right == MAXfloat64 or right == -MAXfloat64:
        return left != right
    gap = fabs(left - right)
    tolerance = REL_TOL * fabs(right)
    return gap > tolerance
150-
151138
def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
152139
na_option='keep', pct=False):
153140
"""
@@ -202,7 +189,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
202189
ranks[argsorted[i]] = nan
203190
continue
204191
count += 1.0
205-
if i == n - 1 or float64_are_diff(sorted_data[i + 1], val):
192+
if i == n - 1 or sorted_data[i + 1] != val:
206193
if tiebreak == TIEBREAK_AVERAGE:
207194
for j in range(i - dups + 1, i + 1):
208195
ranks[argsorted[j]] = sum_ranks / dups
@@ -361,7 +348,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
361348
ranks[i, argsorted[i, j]] = nan
362349
continue
363350
count += 1.0
364-
if j == k - 1 or float64_are_diff(values[i, j + 1], val):
351+
if j == k - 1 or values[i, j + 1] != val:
365352
if tiebreak == TIEBREAK_AVERAGE:
366353
for z in range(j - dups + 1, j + 1):
367354
ranks[i, argsorted[i, z]] = sum_ranks / dups
@@ -1087,7 +1074,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
10871074
sum_wt = 1.
10881075
sum_wt2 = 1.
10891076
old_wt = 1.
1090-
1077+
10911078
for i from 1 <= i < N:
10921079
cur_x = input_x[i]
10931080
cur_y = input_y[i]
@@ -1117,7 +1104,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
11171104
elif is_observation:
11181105
mean_x = cur_x
11191106
mean_y = cur_y
1120-
1107+
11211108
if nobs >= minp:
11221109
if not bias:
11231110
numerator = sum_wt * sum_wt

pandas/tests/test_series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4730,7 +4730,7 @@ def test_rank(self):
47304730
assert_series_equal(iranks, exp)
47314731

47324732
iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1])
4733-
exp = Series([2, 1, 3.5, 5, 3.5, 6])
4733+
exp = Series([2, 1, 3, 5, 4, 6.0])
47344734
iranks = iseries.rank()
47354735
assert_series_equal(iranks, exp)
47364736

pandas/tests/test_stats.py

+37
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,43 @@ def _check(s, expected, method='average'):
4444
series = s if dtype is None else s.astype(dtype)
4545
_check(series, results[method], method=method)
4646

47+
def test_rank_methods_series(self):
    """Compare ``Series.rank`` with ``scipy.stats.rankdata`` for each method.

    A small random sample containing duplicated values is ranked at three
    magnitudes (as drawn, shifted by 1e6, scaled by 1e-6) with every
    tie-breaking method; each result must equal SciPy's ranking.
    Skipped when SciPy >= 0.13 is not available.
    """
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    from scipy.stats import rankdata

    xs = np.random.randn(9)
    # Concatenating overlapping tails of the same draw adds duplicates.
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)

        for m in ['average', 'min', 'max', 'first', 'dense']:
            # Pass the method by keyword so the call does not depend on
            # the positional order of ``rank``'s parameters.
            result = ts.rank(method=m)
            # SciPy names pandas' 'first' tie-breaking 'ordinal'.
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            # rankdata may return an integer array (e.g. for 'ordinal');
            # cast so the comparison is on values, not on dtype.
            expected = Series(sprank, index=index).astype('float64')
            tm.assert_series_equal(result, expected)
64+
65+
def test_rank_methods_frame(self):
    """Compare ``DataFrame.rank`` with ``scipy.stats.rankdata`` per axis.

    A 100 x 26 frame of values drawn from {-1.0, -0.9, ..., 1.0} (so ties
    are plentiful) is ranked at three magnitudes (as drawn, shifted by 1e6,
    scaled by 1e-6), along both axes and with every tie-breaking method;
    each result must equal SciPy's ranking applied along the same axis.
    Skipped when SciPy >= 0.13 is not available.
    """
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    from scipy.stats import rankdata

    xs = np.random.randint(0, 21, (100, 26))
    xs = (xs - 10.0) / 10.0
    cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        df = DataFrame(vals, columns=cols)

        for ax in [0, 1]:
            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = df.rank(axis=ax, method=m)
                # SciPy names pandas' 'first' tie-breaking 'ordinal'.
                sp_method = m if m != 'first' else 'ordinal'
                sprank = np.apply_along_axis(rankdata, ax, vals, sp_method)
                # rankdata may return an integer array (e.g. for
                # 'ordinal'); cast so the comparison is on values,
                # not on dtype.
                sprank = sprank.astype(np.float64)
                expected = DataFrame(sprank, columns=cols)
                tm.assert_frame_equal(result, expected)
83+
4784
def test_rank_dense_method(self):
4885
dtypes = ['O', 'f8', 'i8']
4986
in_out = [([1], [1]),

0 commit comments

Comments
 (0)