Skip to content

Commit a59987b

Browse files
authored
BUG: rank treating min int as NaN (#40659)
1 parent a85cccd commit a59987b

File tree

6 files changed

+58
-19
lines changed

6 files changed

+58
-19
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,7 @@ Numeric
533533
- Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
534534
- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
535535
- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
536+
- Bug in ``rank`` method for :class:`Series`, :class:`DataFrame`, :class:`DataFrameGroupBy`, and :class:`SeriesGroupBy` treating the most negative ``int64`` value as missing (:issue:`32859`)
536537
- Bug in :func:`select_dtypes` behaving differently between Windows and Linux with ``include="int"`` (:issue:`36569`)
537538
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
538539
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)

pandas/_libs/algos.pyx

+19-7
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,7 @@ ctypedef fused rank_t:
962962
def rank_1d(
963963
ndarray[rank_t, ndim=1] values,
964964
const intp_t[:] labels,
965+
bint is_datetimelike=False,
965966
ties_method="average",
966967
bint ascending=True,
967968
bint pct=False,
@@ -977,6 +978,8 @@ def rank_1d(
977978
Array containing unique label for each group, with its ordering
978979
matching up to the corresponding record in `values`. If not called
979980
from a groupby operation, will be an array of 0's
981+
is_datetimelike : bool, default False
982+
True if `values` contains datetime-like entries.
980983
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
981984
'average'
982985
* average: average rank of group
@@ -1032,7 +1035,7 @@ def rank_1d(
10321035

10331036
if rank_t is object:
10341037
mask = missing.isnaobj(masked_vals)
1035-
elif rank_t is int64_t:
1038+
elif rank_t is int64_t and is_datetimelike:
10361039
mask = (masked_vals == NPY_NAT).astype(np.uint8)
10371040
elif rank_t is float64_t:
10381041
mask = np.isnan(masked_vals).astype(np.uint8)
@@ -1059,7 +1062,7 @@ def rank_1d(
10591062
if rank_t is object:
10601063
nan_fill_val = NegInfinity()
10611064
elif rank_t is int64_t:
1062-
nan_fill_val = np.iinfo(np.int64).min
1065+
nan_fill_val = NPY_NAT
10631066
elif rank_t is uint64_t:
10641067
nan_fill_val = 0
10651068
else:
@@ -1275,6 +1278,7 @@ def rank_1d(
12751278
def rank_2d(
12761279
ndarray[rank_t, ndim=2] in_arr,
12771280
int axis=0,
1281+
bint is_datetimelike=False,
12781282
ties_method="average",
12791283
bint ascending=True,
12801284
na_option="keep",
@@ -1299,7 +1303,9 @@ def rank_2d(
12991303
tiebreak = tiebreakers[ties_method]
13001304

13011305
keep_na = na_option == 'keep'
1302-
check_mask = rank_t is not uint64_t
1306+
1307+
# For cases where a mask is not possible, we can avoid mask checks
1308+
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
13031309

13041310
if axis == 0:
13051311
values = np.asarray(in_arr).T.copy()
@@ -1310,28 +1316,34 @@ def rank_2d(
13101316
if values.dtype != np.object_:
13111317
values = values.astype('O')
13121318

1313-
if rank_t is not uint64_t:
1319+
if check_mask:
13141320
if ascending ^ (na_option == 'top'):
13151321
if rank_t is object:
13161322
nan_value = Infinity()
13171323
elif rank_t is float64_t:
13181324
nan_value = np.inf
1319-
elif rank_t is int64_t:
1325+
1326+
# int64 and datetimelike
1327+
else:
13201328
nan_value = np.iinfo(np.int64).max
13211329

13221330
else:
13231331
if rank_t is object:
13241332
nan_value = NegInfinity()
13251333
elif rank_t is float64_t:
13261334
nan_value = -np.inf
1327-
elif rank_t is int64_t:
1335+
1336+
# int64 and datetimelike
1337+
else:
13281338
nan_value = NPY_NAT
13291339

13301340
if rank_t is object:
13311341
mask = missing.isnaobj2d(values)
13321342
elif rank_t is float64_t:
13331343
mask = np.isnan(values)
1334-
elif rank_t is int64_t:
1344+
1345+
# int64 and datetimelike
1346+
else:
13351347
mask = values == NPY_NAT
13361348

13371349
np.putmask(values, mask, nan_value)

pandas/_libs/groupby.pyx

+3-3
Original file line numberDiff line numberDiff line change
@@ -1074,9 +1074,8 @@ def group_rank(float64_t[:, ::1] out,
10741074
ngroups : int
10751075
This parameter is not used, but is needed to match signatures of other
10761076
groupby functions.
1077-
is_datetimelike : bool, default False
1078-
unused in this method but provided for call compatibility with other
1079-
Cython transformations
1077+
is_datetimelike : bool
1078+
True if `values` contains datetime-like entries.
10801079
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
10811080
'average'
10821081
* average: average rank of group
@@ -1104,6 +1103,7 @@ def group_rank(float64_t[:, ::1] out,
11041103
result = rank_1d(
11051104
values=values[:, 0],
11061105
labels=labels,
1106+
is_datetimelike=is_datetimelike,
11071107
ties_method=ties_method,
11081108
ascending=ascending,
11091109
pct=pct,

pandas/core/algorithms.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1031,21 +1031,23 @@ def rank(
10311031
Whether or not to display the returned rankings in integer form
10321032
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
10331033
"""
1034+
is_datetimelike = needs_i8_conversion(values.dtype)
1035+
values = _get_values_for_rank(values)
10341036
if values.ndim == 1:
1035-
values = _get_values_for_rank(values)
10361037
ranks = algos.rank_1d(
10371038
values,
10381039
labels=np.zeros(len(values), dtype=np.intp),
1040+
is_datetimelike=is_datetimelike,
10391041
ties_method=method,
10401042
ascending=ascending,
10411043
na_option=na_option,
10421044
pct=pct,
10431045
)
10441046
elif values.ndim == 2:
1045-
values = _get_values_for_rank(values)
10461047
ranks = algos.rank_2d(
10471048
values,
10481049
axis=axis,
1050+
is_datetimelike=is_datetimelike,
10491051
ties_method=method,
10501052
ascending=ascending,
10511053
na_option=na_option,

pandas/tests/frame/methods/test_rank.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import numpy as np
77
import pytest
88

9-
from pandas._libs import iNaT
109
from pandas._libs.algos import (
1110
Infinity,
1211
NegInfinity,
@@ -382,7 +381,7 @@ def test_pct_max_many_rows(self):
382381
"float32",
383382
),
384383
([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
385-
pytest.param(
384+
(
386385
[
387386
np.iinfo(np.int64).min,
388387
-100,
@@ -394,20 +393,20 @@ def test_pct_max_many_rows(self):
394393
np.iinfo(np.int64).max,
395394
],
396395
"int64",
397-
marks=pytest.mark.xfail(
398-
reason="iNaT is equivalent to minimum value of dtype"
399-
"int64 pending issue GH#16674"
400-
),
401396
),
402397
([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
398+
(
399+
[datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
400+
"datetime64",
401+
),
403402
],
404403
)
405404
def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
406405
dtype_na_map = {
407406
"float64": np.nan,
408407
"float32": np.nan,
409-
"int64": iNaT,
410408
"object": None,
409+
"datetime64": np.datetime64("nat"),
411410
}
412411
# Insert nans at random positions if underlying dtype has missing
413412
# value. Then adjust the expected order by adding nans accordingly

pandas/tests/groupby/test_rank.py

+25
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
from datetime import datetime
2+
13
import numpy as np
24
import pytest
35

46
import pandas as pd
57
from pandas import (
68
DataFrame,
9+
NaT,
710
Series,
811
concat,
912
)
@@ -517,3 +520,25 @@ def test_rank_zero_div(input_key, input_value, output_value):
517520
result = df.groupby("A").rank(method="dense", pct=True)
518521
expected = DataFrame({"B": output_value})
519522
tm.assert_frame_equal(result, expected)
523+
524+
525+
def test_rank_min_int():
526+
# GH-32859
527+
df = DataFrame(
528+
{
529+
"grp": [1, 1, 2],
530+
"int_col": [
531+
np.iinfo(np.int64).min,
532+
np.iinfo(np.int64).max,
533+
np.iinfo(np.int64).min,
534+
],
535+
"datetimelike": [NaT, datetime(2001, 1, 1), NaT],
536+
}
537+
)
538+
539+
result = df.groupby("grp").rank()
540+
expected = DataFrame(
541+
{"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.NaN, 1.0, np.NaN]}
542+
)
543+
544+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)