BUG: Dense ranking with percent now uses 100% basis (pandas-dev#15639)

rouzazari · jreback · commit 1d73cf33b4ac · 2018-03-08T19:54:53.000-05:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -907,6 +907,7 @@ Offsets
 
 Numeric
 ^^^^^^^
+- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`)
 - Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`)
 - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)
 - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`)
diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
@@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
                 sum_ranks = dups = 0
     {{endif}}
     if pct:
-        return ranks / count
+        if tiebreak == TIEBREAK_DENSE:
+            return ranks / total_tie_count
+        else:
+            return ranks / count
     else:
         return ranks
 
@@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
                         ranks[i, argsorted[i, z]] = total_tie_count
                 sum_ranks = dups = 0
         if pct:
-            ranks[i, :] /= count
+            if tiebreak == TIEBREAK_DENSE:
+                ranks[i, :] /= total_tie_count
+            else:
+                ranks[i, :] /= count
     if axis == 0:
         return ranks.T
     else:
diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py
@@ -1,16 +1,16 @@
 # -*- coding: utf-8 -*-
 import pytest
-from datetime import timedelta, datetime
-from distutils.version import LooseVersion
-from numpy import nan
 import numpy as np
+import pandas.util.testing as tm
 
-from pandas import Series, DataFrame
+from distutils.version import LooseVersion
+from datetime import timedelta, datetime
+from numpy import nan
 
-from pandas.compat import product
 from pandas.util.testing import assert_frame_equal
-import pandas.util.testing as tm
 from pandas.tests.frame.common import TestData
+from pandas import Series, DataFrame
+from pandas.compat import product
 
 
 class TestRank(TestData):
@@ -266,3 +266,34 @@ def _check2d(df, expected, method='average', axis=0):
                 continue
             frame = df if dtype is None else df.astype(dtype)
             _check2d(frame, results[method], method=method, axis=axis)
+
+
+@pytest.mark.parametrize(
+    "method,exp", [("dense",
+                    [[1., 1., 1.],
+                     [1., 0.5, 2. / 3],
+                     [1., 0.5, 1. / 3]]),
+                   ("min",
+                    [[1. / 3, 1., 1.],
+                     [1. / 3, 1. / 3, 2. / 3],
+                     [1. / 3, 1. / 3, 1. / 3]]),
+                   ("max",
+                    [[1., 1., 1.],
+                     [1., 2. / 3, 2. / 3],
+                     [1., 2. / 3, 1. / 3]]),
+                   ("average",
+                    [[2. / 3, 1., 1.],
+                     [2. / 3, 0.5, 2. / 3],
+                     [2. / 3, 0.5, 1. / 3]]),
+                   ("first",
+                    [[1. / 3, 1., 1.],
+                     [2. / 3, 1. / 3, 2. / 3],
+                     [3. / 3, 2. / 3, 1. / 3]])])
+def test_rank_pct_true(method, exp):
+    # see gh-15630.
+
+    df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
+    result = df.rank(method=method, pct=True)
+
+    expected = DataFrame(exp)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py
@@ -376,3 +376,96 @@ def test_rank_modify_inplace(self):
         s.rank()
         result = s
         assert_series_equal(result, expected)
+
+
+# GH15630, pct should be on 100% basis when method='dense'
+
+@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
+@pytest.mark.parametrize('ser, exp', [
+    ([1], [1.]),
+    ([1, 2], [1. / 2, 2. / 2]),
+    ([2, 2], [1., 1.]),
+    ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+    ([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
+    ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
+    ([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]),
+    ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_dense_pct(dtype, ser, exp):
+        s = Series(ser).astype(dtype)
+        result = s.rank(method='dense', pct=True)
+        expected = Series(exp).astype(result.dtype)
+        assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
+@pytest.mark.parametrize('ser, exp', [
+    ([1], [1.]),
+    ([1, 2], [1. / 2, 2. / 2]),
+    ([2, 2], [1. / 2, 1. / 2]),
+    ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+    ([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
+    ([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]),
+    ([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]),
+    ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_min_pct(dtype, ser, exp):
+        s = Series(ser).astype(dtype)
+        result = s.rank(method='min', pct=True)
+        expected = Series(exp).astype(result.dtype)
+        assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
+@pytest.mark.parametrize('ser, exp', [
+    ([1], [1.]),
+    ([1, 2], [1. / 2, 2. / 2]),
+    ([2, 2], [1., 1.]),
+    ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+    ([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
+    ([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]),
+    ([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]),
+    ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_max_pct(dtype, ser, exp):
+        s = Series(ser).astype(dtype)
+        result = s.rank(method='max', pct=True)
+        expected = Series(exp).astype(result.dtype)
+        assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
+@pytest.mark.parametrize('ser, exp', [
+    ([1], [1.]),
+    ([1, 2], [1. / 2, 2. / 2]),
+    ([2, 2], [1.5 / 2, 1.5 / 2]),
+    ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+    ([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
+    ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]),
+    ([1, 1, 3, 3, 5, 5],
+     [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]),
+    ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_average_pct(dtype, ser, exp):
+        s = Series(ser).astype(dtype)
+        result = s.rank(method='average', pct=True)
+        expected = Series(exp).astype(result.dtype)
+        assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize('dtype', ['f8', 'i8'])
+@pytest.mark.parametrize('ser, exp', [
+    ([1], [1.]),
+    ([1, 2], [1. / 2, 2. / 2]),
+    ([2, 2], [1. / 2, 2. / 2.]),
+    ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
+    ([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]),
+    ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
+    ([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]),
+    ([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]),
+    ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
+def test_rank_first_pct(dtype, ser, exp):
+        s = Series(ser).astype(dtype)
+        result = s.rank(method='first', pct=True)
+        expected = Series(exp).astype(result.dtype)
+        assert_series_equal(result, expected)