From 93662d5df91695e7ad9b13475ad13cb8d84e47f2 Mon Sep 17 00:00:00 2001 From: Noah Spies Date: Tue, 15 Apr 2014 08:28:59 -0700 Subject: [PATCH] BUG: Series/DataFrame.rank() doesn't handle small floats correctly #6868 cleaning up comments BUG: Series/DataFrame.rank() doesn't handle small floats correctly #6868 adding test for ranking with np.inf Added release note #6886 Fixing float conversions in test_rank() --- doc/source/release.rst | 1 + pandas/algos.pyx | 17 +++++++++++++++-- pandas/tests/test_frame.py | 6 ++++++ pandas/tests/test_series.py | 23 ++++++++++++++++++++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 271daa1623a4b..248e18034c400 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -402,6 +402,7 @@ Bug Fixes - Bug in `DataFrame.plot` and `Series.plot` legend behave inconsistently when plotting to the same axes repeatedly (:issue:`6678`) - Internal tests for patching ``__finalize__`` / bug in merge not finalizing (:issue:`6923`, :issue:`6927`) - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`) +- Bug in ``Series.rank`` and ``DataFrame.rank`` that caused small floats (<1e-13) to all receive the same rank (:issue:`6886`) pandas 0.13.1 ------------- diff --git a/pandas/algos.pyx b/pandas/algos.pyx index bba6b46c52e37..4628853df3953 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -7,6 +7,7 @@ cimport cython import_array() cdef float64_t FP_ERR = 1e-13 +cdef float64_t REL_TOL = 1e-07 cimport util @@ -132,6 +133,18 @@ cdef _take_2d_object(ndarray[object, ndim=2] values, return result +cdef inline bint float64_are_diff(float64_t left, float64_t right): + cdef double abs_diff, allowed + if right == MAXfloat64 or right == -MAXfloat64: + if left == right: + return False + else: + return True + else: + abs_diff = fabs(left - right) + allowed = REL_TOL * fabs(right) + return abs_diff > allowed + def rank_1d_float64(object in_arr, ties_method='average', ascending=True, na_option='keep', pct=False): """ @@ -186,7 +199,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, ranks[argsorted[i]] = nan continue count += 1.0 - if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR: + if i == n - 1 or float64_are_diff(sorted_data[i + 1], val): if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -345,7 +358,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', ranks[i, argsorted[i, j]] = nan continue count += 1.0 - if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + if j == k - 1 or float64_are_diff(values[i, j + 1], val): if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a9e48c62f9693..178a73e3d5967 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11031,6 +11031,7 @@ def test_rank(self): exp = df.astype(float).rank(1) assert_frame_equal(result, exp) + def test_rank2(self): from datetime import datetime df = DataFrame([[1, 3, 2], [1, 2, 3]]) @@ -11084,6 +11085,11 @@ def test_rank2(self): expected = self.mixed_frame.rank(1, numeric_only=True) assert_frame_equal(result, expected) + df = DataFrame({"a":[1e-20, -5, 1e-20+1e-40, 10, 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a":[ 3.5, 1. , 3.5, 5. , 6. , 7. , 2. ]}) + assert_frame_equal(df.rank(), exp) + + def test_rank_na_option(self): from pandas.compat.scipy import rankdata diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index d1775177d3c1d..d8eafc7cb8eab 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4047,14 +4047,35 @@ def test_rank(self): exp = iseries / 4.0 iranks = iseries.rank(pct=True) assert_series_equal(iranks, exp) - rng = date_range('1/1/1990', periods=5) + rng = date_range('1/1/1990', periods=5) iseries = Series(np.arange(5), rng) + 1 iseries.ix[4] = np.nan exp = iseries / 4.0 iranks = iseries.rank(pct=True) assert_series_equal(iranks, exp) + iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1]) + exp = Series([2, 1, 3.5, 5, 3.5, 6]) + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + values = np.array([-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40], dtype='float64') + random_order = np.random.permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(random_order + 1.0, dtype='float64') + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + def test_rank_inf(self): + raise nose.SkipTest('DataFrame.rank does not currently rank np.inf and -np.inf properly') + + values = np.array([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40, np.inf], dtype='float64') + random_order = np.random.permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(random_order + 1.0, dtype='float64') + iranks = iseries.rank() + assert_series_equal(iranks, exp) def test_from_csv(self):