Skip to content

Series.rank() doesn't handle small floats correctly #6886

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ Bug Fixes
- Bug in `DataFrame.plot` and `Series.plot` where the legend behaves inconsistently when plotting to the same axes repeatedly (:issue:`6678`)
- Internal tests for patching ``__finalize__`` / bug in merge not finalizing (:issue:`6923`, :issue:`6927`)
- accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`)
- Bug in ``Series.rank`` and ``DataFrame.rank`` that caused small floats (<1e-13) to all receive the same rank (:issue:`6886`)

pandas 0.13.1
-------------
Expand Down
17 changes: 15 additions & 2 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ cimport cython
import_array()

# Absolute threshold: two float64 values were treated as tied when
# fabs(a - b) <= FP_ERR (i.e. "different" only if fabs(a - b) > FP_ERR).
cdef float64_t FP_ERR = 1e-13
# Relative tolerance used by float64_are_diff: two values count as different
# when fabs(left - right) > REL_TOL * fabs(right).
cdef float64_t REL_TOL = 1e-07

cimport util

Expand Down Expand Up @@ -132,6 +133,18 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
return result


cdef inline bint float64_are_diff(float64_t left, float64_t right):
    # Decide whether two float64 values should be treated as distinct.
    #
    # Returns True when `left` and `right` differ by more than a tolerance
    # that scales with the magnitude of `right`, and False when they are
    # close enough to be considered equal.
    cdef double gap, tolerance

    # When `right` is +/-MAXfloat64 only an exact match counts as "same".
    # NOTE(review): MAXfloat64 is defined outside this view; presumably it is
    # the infinity sentinel, in which case the relative tolerance below would
    # itself be infinite and useless -- confirm against its definition.
    if right == MAXfloat64 or right == -MAXfloat64:
        return left != right

    # Relative comparison: the allowed gap is REL_TOL times |right|, so the
    # check is asymmetric in its arguments (only `right` sets the scale).
    gap = fabs(left - right)
    tolerance = REL_TOL * fabs(right)
    return gap > tolerance

def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
na_option='keep', pct=False):
"""
Expand Down Expand Up @@ -186,7 +199,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
ranks[argsorted[i]] = nan
continue
count += 1.0
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
if i == n - 1 or float64_are_diff(sorted_data[i + 1], val):
if tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = sum_ranks / dups
Expand Down Expand Up @@ -345,7 +358,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
ranks[i, argsorted[i, j]] = nan
continue
count += 1.0
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
if j == k - 1 or float64_are_diff(values[i, j + 1], val):
if tiebreak == TIEBREAK_AVERAGE:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = sum_ranks / dups
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11031,6 +11031,7 @@ def test_rank(self):
exp = df.astype(float).rank(1)
assert_frame_equal(result, exp)


def test_rank2(self):
from datetime import datetime
df = DataFrame([[1, 3, 2], [1, 2, 3]])
Expand Down Expand Up @@ -11084,6 +11085,11 @@ def test_rank2(self):
expected = self.mixed_frame.rank(1, numeric_only=True)
assert_frame_equal(result, expected)

df = DataFrame({"a":[1e-20, -5, 1e-20+1e-40, 10, 1e60, 1e80, 1e-30]})
exp = DataFrame({"a":[ 3.5, 1. , 3.5, 5. , 6. , 7. , 2. ]})
assert_frame_equal(df.rank(), exp)


def test_rank_na_option(self):
from pandas.compat.scipy import rankdata

Expand Down
23 changes: 22 additions & 1 deletion pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4047,14 +4047,35 @@ def test_rank(self):
exp = iseries / 4.0
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)
rng = date_range('1/1/1990', periods=5)

rng = date_range('1/1/1990', periods=5)
iseries = Series(np.arange(5), rng) + 1
iseries.ix[4] = np.nan
exp = iseries / 4.0
iranks = iseries.rank(pct=True)
assert_series_equal(iranks, exp)

iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1])
exp = Series([2, 1, 3.5, 5, 3.5, 6])
iranks = iseries.rank()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of these tests is failing because it is converting to int64. I'm not sure which one, or where that happens; can you check it out? (I thought it should always be float.)

assert_series_equal(iranks, exp)

values = np.array([-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40], dtype='float64')
random_order = np.random.permutation(len(values))
iseries = Series(values[random_order])
exp = Series(random_order + 1.0, dtype='float64')
iranks = iseries.rank()
assert_series_equal(iranks, exp)

def test_rank_inf(self):
    # Unconditionally skipped: ranking of +/-inf is reported as not working
    # yet, so everything after the raise is currently unreachable and only
    # documents the intended behaviour.
    # NOTE(review): the message names DataFrame.rank while the body below
    # exercises Series.rank -- confirm which one is meant.
    raise nose.SkipTest('DataFrame.rank does not currently rank np.inf and -np.inf properly')

    # Strictly ascending values bracketed by -inf and +inf, so the value at
    # sorted position p should receive rank p + 1.
    ascending = np.array([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40, np.inf], dtype='float64')
    n_values = len(ascending)
    shuffle = np.random.permutation(n_values)

    # Element i of the shuffled series is ascending[shuffle[i]], hence its
    # expected rank is shuffle[i] + 1.
    shuffled = Series(ascending[shuffle])
    expected = Series(shuffle + 1.0, dtype='float64')
    assert_series_equal(shuffled.rank(), expected)


def test_from_csv(self):
Expand Down