Skip to content

Commit 3f9db65

Browse files
jschendeltm9k1
authored and committed
BUG: Fix Series/DataFrame.rank(pct=True) with more than 2**24 rows (pandas-dev#23688)
1 parent e264240 commit 3f9db65

File tree

5 files changed

+26
-2
lines changed

5 files changed

+26
-2
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1206,6 +1206,7 @@ Numeric
12061206
- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`)
12071207
- Bug in :meth:`Series.rpow` with object dtype returning ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`).
12081208
- :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`)
1209+
- Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2\ :sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`)
12091210

12101211
Strings
12111212
^^^^^^^

pandas/_libs/algos_rank_helper.pxi.in

+2-2
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
5353
int tiebreak = 0
5454
bint keep_na = 0
5555
bint isnan
56-
float count = 0.0
56+
float64_t count = 0.0
5757
tiebreak = tiebreakers[ties_method]
5858

5959
{{if dtype == 'float64'}}
@@ -228,7 +228,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
228228
float64_t sum_ranks = 0
229229
int tiebreak = 0
230230
bint keep_na = 0
231-
float count = 0.0
231+
float64_t count = 0.0
232232

233233
tiebreak = tiebreakers[ties_method]
234234

pandas/tests/frame/test_rank.py

+7
Original file line numberDiff line numberDiff line change
@@ -309,3 +309,10 @@ def test_rank_pct_true(self, method, exp):
309309

310310
expected = DataFrame(exp)
311311
tm.assert_frame_equal(result, expected)
312+
313+
def test_pct_max_many_rows(self):
314+
# GH 18271
315+
df = DataFrame({'A': np.arange(2**24 + 1),
316+
'B': np.arange(2**24 + 1, 0, -1)})
317+
result = df.rank(pct=True).max()
318+
assert (result == 1).all()

pandas/tests/series/test_rank.py

+7
Original file line numberDiff line numberDiff line change
@@ -495,3 +495,10 @@ def test_rank_first_pct(dtype, ser, exp):
495495
result = s.rank(method='first', pct=True)
496496
expected = Series(exp).astype(result.dtype)
497497
assert_series_equal(result, expected)
498+
499+
500+
def test_pct_max_many_rows():
501+
# GH 18271
502+
s = Series(np.arange(2**24 + 1))
503+
result = s.rank(pct=True).max()
504+
assert result == 1

pandas/tests/test_algos.py

+9
Original file line numberDiff line numberDiff line change
@@ -1462,6 +1462,15 @@ def test_too_many_ndims(self):
14621462
with pytest.raises(TypeError, match=msg):
14631463
algos.rank(arr)
14641464

1465+
@pytest.mark.parametrize('values', [
1466+
np.arange(2**24 + 1),
1467+
np.arange(2**25 + 2).reshape(2**24 + 1, 2)],
1468+
ids=['1d', '2d'])
1469+
def test_pct_max_many_rows(self, values):
1470+
# GH 18271
1471+
result = algos.rank(values, pct=True).max()
1472+
assert result == 1
1473+
14651474

14661475
def test_pad_backfill_object_segfault():
14671476

0 commit comments

Comments
 (0)