Merge pull request pandas-dev#8379 from behzadnouri/rank-tol

jreback · jreback · commit 0d35dd4cb88b · 2015-03-02T20:15:35.000-05:00
BUG: floats cannot be ranked with tolerance
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -349,6 +349,7 @@ Bug Fixes
 - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
 - Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).
 - Bug in ``unstack`` with ``TimedeltaIndex`` or ``DatetimeIndex`` and nulls (:issue:`9491`).
+- Bug in ``rank`` where floats were compared with a relative tolerance, causing inconsistent ranks for nearly equal values (:issue:`8365`).
 
 
 - Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -7,7 +7,6 @@ cimport cython
 import_array()
 
 cdef float64_t FP_ERR = 1e-13
-cdef float64_t REL_TOL = 1e-07
 
 cimport util
 
@@ -136,18 +135,6 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
     return result
 
 
-cdef inline bint float64_are_diff(float64_t left, float64_t right):
-    cdef double abs_diff, allowed
-    if right == MAXfloat64 or right == -MAXfloat64:
-        if left == right:
-            return False
-        else:
-            return True
-    else:
-        abs_diff = fabs(left - right)
-        allowed = REL_TOL * fabs(right)
-        return abs_diff > allowed
-
 def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
                     na_option='keep', pct=False):
     """
@@ -202,7 +189,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
             ranks[argsorted[i]] = nan
             continue
         count += 1.0
-        if i == n - 1 or float64_are_diff(sorted_data[i + 1], val):
+        if i == n - 1 or sorted_data[i + 1] != val:
             if tiebreak == TIEBREAK_AVERAGE:
                 for j in range(i - dups + 1, i + 1):
                     ranks[argsorted[j]] = sum_ranks / dups
@@ -361,7 +348,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
                 ranks[i, argsorted[i, j]] = nan
                 continue
             count += 1.0
-            if j == k - 1 or float64_are_diff(values[i, j + 1], val):
+            if j == k - 1 or values[i, j + 1] != val:
                 if tiebreak == TIEBREAK_AVERAGE:
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = sum_ranks / dups
@@ -1087,7 +1074,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
     sum_wt = 1.
     sum_wt2 = 1.
     old_wt = 1.
-    
+
     for i from 1 <= i < N:
         cur_x = input_x[i]
         cur_y = input_y[i]
@@ -1117,7 +1104,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
         elif is_observation:
             mean_x = cur_x
             mean_y = cur_y
-        
+
         if nobs >= minp:
             if not bias:
                 numerator = sum_wt * sum_wt
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -4730,7 +4730,7 @@ def test_rank(self):
         assert_series_equal(iranks, exp)
 
         iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1])
-        exp = Series([2, 1, 3.5, 5, 3.5, 6])
+        exp = Series([2, 1, 3, 5, 4, 6.0])
         iranks = iseries.rank()
         assert_series_equal(iranks, exp)
 
diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py
@@ -44,6 +44,43 @@ def _check(s, expected, method='average'):
             series = s if dtype is None else s.astype(dtype)
             _check(series, results[method], method=method)
 
+    def test_rank_methods_series(self):
+        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
+        from scipy.stats import rankdata
+
+        xs = np.random.randn(9)
+        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates
+        np.random.shuffle(xs)
+
+        index = [chr(ord('a') + i) for i in range(len(xs))]
+
+        for vals in [xs, xs + 1e6, xs * 1e-6]:
+            ts = Series(vals, index=index)
+
+            for m in ['average', 'min', 'max', 'first', 'dense']:
+                result = ts.rank(m)
+                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
+                tm.assert_series_equal(result, Series(sprank, index=index))
+
+    def test_rank_methods_frame(self):
+        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
+        from scipy.stats import rankdata
+
+        xs = np.random.randint(0, 21, (100, 26))
+        xs = (xs - 10.0) / 10.0
+        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]
+
+        for vals in [xs, xs + 1e6, xs * 1e-6]:
+            df = DataFrame(vals, columns=cols)
+
+            for ax in [0, 1]:
+                for m in ['average', 'min', 'max', 'first', 'dense']:
+                    result = df.rank(axis=ax, method=m)
+                    sprank = np.apply_along_axis(rankdata, ax, vals,
+                                      m if m != 'first' else 'ordinal')
+                    expected = DataFrame(sprank, columns=cols)
+                    tm.assert_frame_equal(result, expected)
+
     def test_rank_dense_method(self):
         dtypes = ['O', 'f8', 'i8']
         in_out = [([1], [1]),