BUG: Fix Series/DataFrame.rank(pct=True) with more than 2**24 rows (pandas-dev#23688)

jschendel · tm9k1 · commit 3f9db65fa932 · 2018-11-20T02:37:30.000+05:30
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -1206,6 +1206,7 @@ Numeric
 - Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the narray as ``timedelta64[ns]`` dtype (:issue:`23114`)
 - Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`).
 - :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`)
+- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` where ``pct=True`` with more than 2\ :sup:`24` rows resulted in percentages greater than 1.0 (:issue:`18271`)
 
 Strings
 ^^^^^^^
diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
@@ -53,7 +53,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
         int tiebreak = 0
         bint keep_na = 0
         bint isnan
-        float count = 0.0
+        float64_t count = 0.0
     tiebreak = tiebreakers[ties_method]
 
     {{if dtype == 'float64'}}
@@ -228,7 +228,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
         float64_t sum_ranks = 0
         int tiebreak = 0
         bint keep_na = 0
-        float count = 0.0
+        float64_t count = 0.0
 
     tiebreak = tiebreakers[ties_method]
 
diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py
@@ -309,3 +309,10 @@ def test_rank_pct_true(self, method, exp):
 
         expected = DataFrame(exp)
         tm.assert_frame_equal(result, expected)
+
+    def test_pct_max_many_rows(self):
+        # GH 18271: pct ranks must top out at 1.0 beyond 2**24 rows
+        df = DataFrame({'A': np.arange(2**24 + 1),
+                        'B': np.arange(2**24 + 1, 0, -1)})
+        result = df.rank(pct=True).max()
+        assert (result == 1).all()
diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py
@@ -495,3 +495,10 @@ def test_rank_first_pct(dtype, ser, exp):
         result = s.rank(method='first', pct=True)
         expected = Series(exp).astype(result.dtype)
         assert_series_equal(result, expected)
+
+
+def test_pct_max_many_rows():
+    # GH 18271: pct ranks must top out at 1.0 beyond 2**24 rows
+    s = Series(np.arange(2**24 + 1))
+    result = s.rank(pct=True).max()
+    assert result == 1
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -1462,6 +1462,15 @@ def test_too_many_ndims(self):
         with pytest.raises(TypeError, match=msg):
             algos.rank(arr)
 
+    @pytest.mark.parametrize('values', [
+        np.arange(2**24 + 1),
+        np.arange(2**25 + 2).reshape(2**24 + 1, 2)],
+        ids=['1d', '2d'])
+    def test_pct_max_many_rows(self, values):
+        # GH 18271: 1d and 2d inputs with 2**24 + 1 rows; max pct must be 1
+        result = algos.rank(values, pct=True).max()
+        assert result == 1
+
 
 def test_pad_backfill_object_segfault():