Skip to content

Commit d96fd80

Browse files
Noah Spies authored and jreback committed
BUG: Series/DataFrame.rank() doesn't handle small floats correctly #6868
Adding test for ranking with np.inf.
Added release note #6886.
Fixing float conversions in test_rank().
1 parent 7168d98 commit d96fd80

File tree

4 files changed

+44
-3
lines changed

4 files changed

+44
-3
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,7 @@ Bug Fixes
418418
parser when no options are ignored (:issue:`6607`)
419419
- Bug in C parser with leading whitespace (:issue:`3374`)
420420
- Bug in C parser with ``delim_whitespace=True`` and ``\r``-delimited lines
421+
- Bug in ``Series.rank`` and ``DataFrame.rank`` that caused small floats (<1e-13) to all receive the same rank (:issue:`6886`)
421422

422423
pandas 0.13.1
423424
-------------

pandas/algos.pyx

+15-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ cimport cython
77
import_array()
88

99
cdef float64_t FP_ERR = 1e-13
10+
cdef float64_t REL_TOL = 1e-07
1011

1112
cimport util
1213

@@ -132,6 +133,18 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
132133
return result
133134

134135

136+
cdef inline bint float64_are_diff(float64_t left, float64_t right):
    # Report whether ``left`` and ``right`` should be treated as distinct
    # values (True) or as a tie (False).
    #
    # * If ``right`` equals ``MAXfloat64`` or ``-MAXfloat64``, only exact
    #   equality counts as a tie; no tolerance is applied.
    # * Otherwise the two are distinct when their absolute difference
    #   exceeds ``REL_TOL * fabs(right)``, i.e. the tolerance scales with
    #   the magnitude of ``right``.
    cdef double gap, tolerance
    if right == MAXfloat64 or right == -MAXfloat64:
        return left != right
    gap = fabs(left - right)
    tolerance = REL_TOL * fabs(right)
    return gap > tolerance
147+
135148
def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
136149
na_option='keep', pct=False):
137150
"""
@@ -186,7 +199,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
186199
ranks[argsorted[i]] = nan
187200
continue
188201
count += 1.0
189-
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
202+
if i == n - 1 or float64_are_diff(sorted_data[i + 1], val):
190203
if tiebreak == TIEBREAK_AVERAGE:
191204
for j in range(i - dups + 1, i + 1):
192205
ranks[argsorted[j]] = sum_ranks / dups
@@ -345,7 +358,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
345358
ranks[i, argsorted[i, j]] = nan
346359
continue
347360
count += 1.0
348-
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
361+
if j == k - 1 or float64_are_diff(values[i, j + 1], val):
349362
if tiebreak == TIEBREAK_AVERAGE:
350363
for z in range(j - dups + 1, j + 1):
351364
ranks[i, argsorted[i, z]] = sum_ranks / dups

pandas/tests/test_frame.py

+6
Original file line numberDiff line numberDiff line change
@@ -11031,6 +11031,7 @@ def test_rank(self):
1103111031
exp = df.astype(float).rank(1)
1103211032
assert_frame_equal(result, exp)
1103311033

11034+
1103411035
def test_rank2(self):
1103511036
from datetime import datetime
1103611037
df = DataFrame([[1, 3, 2], [1, 2, 3]])
@@ -11084,6 +11085,11 @@ def test_rank2(self):
1108411085
expected = self.mixed_frame.rank(1, numeric_only=True)
1108511086
assert_frame_equal(result, expected)
1108611087

11088+
df = DataFrame({"a":[1e-20, -5, 1e-20+1e-40, 10, 1e60, 1e80, 1e-30]})
11089+
exp = DataFrame({"a":[ 3.5, 1. , 3.5, 5. , 6. , 7. , 2. ]})
11090+
assert_frame_equal(df.rank(), exp)
11091+
11092+
1108711093
def test_rank_na_option(self):
1108811094
from pandas.compat.scipy import rankdata
1108911095

pandas/tests/test_series.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -4047,14 +4047,35 @@ def test_rank(self):
40474047
exp = iseries / 4.0
40484048
iranks = iseries.rank(pct=True)
40494049
assert_series_equal(iranks, exp)
4050-
rng = date_range('1/1/1990', periods=5)
40514050

4051+
rng = date_range('1/1/1990', periods=5)
40524052
iseries = Series(np.arange(5), rng) + 1
40534053
iseries.ix[4] = np.nan
40544054
exp = iseries / 4.0
40554055
iranks = iseries.rank(pct=True)
40564056
assert_series_equal(iranks, exp)
40574057

4058+
iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1])
4059+
exp = Series([2, 1, 3.5, 5, 3.5, 6])
4060+
iranks = iseries.rank()
4061+
assert_series_equal(iranks, exp)
4062+
4063+
values = np.array([-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40], dtype='float64')
4064+
random_order = np.random.permutation(len(values))
4065+
iseries = Series(values[random_order])
4066+
exp = Series(random_order + 1.0, dtype='float64')
4067+
iranks = iseries.rank()
4068+
assert_series_equal(iranks, exp)
4069+
4070+
def test_rank_inf(self):
    """Rank a shuffled, strictly increasing float64 array bounded by -inf/+inf.

    The test is skipped unconditionally by the ``raise`` below, so the
    statements after it do not run until that skip is removed.
    """
    raise nose.SkipTest('DataFrame.rank does not currently rank np.inf and -np.inf properly')

    # Strictly increasing values, so the value at sorted position k has rank k + 1.
    ordered = np.array([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40, np.inf], dtype='float64')
    shuffle = np.random.permutation(len(ordered))
    shuffled = Series(ordered[shuffle])
    # Element i of the shuffled series came from sorted position shuffle[i].
    expected = Series(shuffle + 1.0, dtype='float64')
    observed = shuffled.rank()
    assert_series_equal(observed, expected)
40584079

40594080

40604081
def test_from_csv(self):

0 commit comments

Comments
 (0)