Skip to content

Commit 7a38d63

Browse files
authored
BUG/REF: use sorted_rank_1d for rank_2d (pandas-dev#41931)
1 parent 857a9b1 commit 7a38d63

File tree

4 files changed

+90
-129
lines changed

4 files changed

+90
-129
lines changed

doc/source/whatsnew/v1.4.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,8 @@ Timezones
137137

138138
Numeric
139139
^^^^^^^
140-
-
140+
- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
141+
- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`)
141142
-
142143

143144
Conversion

pandas/_libs/algos.pyx

+55-89
Original file line numberDiff line numberDiff line change
@@ -1372,26 +1372,29 @@ def rank_2d(
13721372
Fast NaN-friendly version of ``scipy.stats.rankdata``.
13731373
"""
13741374
cdef:
1375-
Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
1376-
Py_ssize_t infs
1377-
ndarray[float64_t, ndim=2] ranks
1375+
Py_ssize_t k, n, col
1376+
float64_t[::1, :] out # Column-major so columns are contiguous
1377+
int64_t[::1, :] grp_sizes
1378+
const intp_t[:] labels
13781379
ndarray[rank_t, ndim=2] values
1379-
ndarray[intp_t, ndim=2] argsort_indexer
1380-
ndarray[uint8_t, ndim=2] mask
1381-
rank_t val, nan_fill_val
1382-
float64_t count, sum_ranks = 0.0
1383-
int tiebreak = 0
1384-
int64_t idx
1385-
bint check_mask, condition, keep_na, nans_rank_highest
1380+
rank_t[:, :] masked_vals
1381+
intp_t[:, :] sort_indexer
1382+
uint8_t[:, :] mask
1383+
TiebreakEnumType tiebreak
1384+
bint check_mask, keep_na, nans_rank_highest
1385+
rank_t nan_fill_val
13861386

13871387
tiebreak = tiebreakers[ties_method]
1388+
if tiebreak == TIEBREAK_FIRST:
1389+
if not ascending:
1390+
tiebreak = TIEBREAK_FIRST_DESCENDING
13881391

13891392
keep_na = na_option == 'keep'
13901393

13911394
# For cases where a mask is not possible, we can avoid mask checks
13921395
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
13931396

1394-
if axis == 0:
1397+
if axis == 1:
13951398
values = np.asarray(in_arr).T.copy()
13961399
else:
13971400
values = np.asarray(in_arr).copy()
@@ -1403,99 +1406,62 @@ def rank_2d(
14031406
nans_rank_highest = ascending ^ (na_option == 'top')
14041407
if check_mask:
14051408
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
1409+
14061410
if rank_t is object:
1407-
mask = missing.isnaobj2d(values)
1411+
mask = missing.isnaobj2d(values).view(np.uint8)
14081412
elif rank_t is float64_t:
1409-
mask = np.isnan(values)
1413+
mask = np.isnan(values).view(np.uint8)
14101414

14111415
# int64 and datetimelike
14121416
else:
1413-
mask = values == NPY_NAT
1414-
1417+
mask = (values == NPY_NAT).view(np.uint8)
14151418
np.putmask(values, mask, nan_fill_val)
14161419
else:
1417-
mask = np.zeros_like(values, dtype=bool)
1420+
mask = np.zeros_like(values, dtype=np.uint8)
1421+
1422+
if nans_rank_highest:
1423+
order = (values, mask)
1424+
else:
1425+
order = (values, ~np.asarray(mask))
14181426

14191427
n, k = (<object>values).shape
1420-
ranks = np.empty((n, k), dtype='f8')
1428+
out = np.empty((n, k), dtype='f8', order='F')
1429+
grp_sizes = np.ones((n, k), dtype='i8', order='F')
1430+
labels = np.zeros(n, dtype=np.intp)
14211431

1422-
if tiebreak == TIEBREAK_FIRST:
1423-
# need to use a stable sort here
1424-
argsort_indexer = values.argsort(axis=1, kind='mergesort')
1425-
if not ascending:
1426-
tiebreak = TIEBREAK_FIRST_DESCENDING
1432+
# lexsort is slower, so only use if we need to worry about the mask
1433+
if check_mask:
1434+
sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False)
14271435
else:
1428-
argsort_indexer = values.argsort(1)
1436+
kind = "stable" if ties_method == "first" else None
1437+
sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False)
14291438

14301439
if not ascending:
1431-
argsort_indexer = argsort_indexer[:, ::-1]
1432-
1433-
values = _take_2d(values, argsort_indexer)
1440+
sort_indexer = sort_indexer[::-1, :]
14341441

1435-
for i in range(n):
1436-
dups = sum_ranks = infs = 0
1437-
1438-
total_tie_count = 0
1439-
count = 0.0
1440-
for j in range(k):
1441-
val = values[i, j]
1442-
idx = argsort_indexer[i, j]
1443-
if keep_na and check_mask and mask[i, idx]:
1444-
ranks[i, idx] = NaN
1445-
infs += 1
1446-
continue
1447-
1448-
count += 1.0
1449-
1450-
sum_ranks += (j - infs) + 1
1451-
dups += 1
1452-
1453-
if rank_t is object:
1454-
condition = (
1455-
j == k - 1 or
1456-
are_diff(values[i, j + 1], val) or
1457-
(keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
1458-
)
1459-
else:
1460-
condition = (
1461-
j == k - 1 or
1462-
values[i, j + 1] != val or
1463-
(keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
1464-
)
1465-
1466-
if condition:
1467-
if tiebreak == TIEBREAK_AVERAGE:
1468-
for z in range(j - dups + 1, j + 1):
1469-
ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
1470-
elif tiebreak == TIEBREAK_MIN:
1471-
for z in range(j - dups + 1, j + 1):
1472-
ranks[i, argsort_indexer[i, z]] = j - dups + 2
1473-
elif tiebreak == TIEBREAK_MAX:
1474-
for z in range(j - dups + 1, j + 1):
1475-
ranks[i, argsort_indexer[i, z]] = j + 1
1476-
elif tiebreak == TIEBREAK_FIRST:
1477-
if rank_t is object:
1478-
raise ValueError('first not supported for non-numeric data')
1479-
else:
1480-
for z in range(j - dups + 1, j + 1):
1481-
ranks[i, argsort_indexer[i, z]] = z + 1
1482-
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
1483-
for z in range(j - dups + 1, j + 1):
1484-
ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
1485-
elif tiebreak == TIEBREAK_DENSE:
1486-
total_tie_count += 1
1487-
for z in range(j - dups + 1, j + 1):
1488-
ranks[i, argsort_indexer[i, z]] = total_tie_count
1489-
sum_ranks = dups = 0
1490-
if pct:
1491-
if tiebreak == TIEBREAK_DENSE:
1492-
ranks[i, :] /= total_tie_count
1493-
else:
1494-
ranks[i, :] /= count
1495-
if axis == 0:
1496-
return ranks.T
1442+
# putmask doesn't accept a memoryview, so we assign in a separate step
1443+
masked_vals = values
1444+
with nogil:
1445+
for col in range(k):
1446+
rank_sorted_1d(
1447+
out[:, col],
1448+
grp_sizes[:, col],
1449+
labels,
1450+
sort_indexer[:, col],
1451+
masked_vals[:, col],
1452+
mask[:, col],
1453+
tiebreak,
1454+
check_mask,
1455+
False,
1456+
keep_na,
1457+
pct,
1458+
n,
1459+
)
1460+
1461+
if axis == 1:
1462+
return np.asarray(out.T)
14971463
else:
1498-
return ranks
1464+
return np.asarray(out)
14991465

15001466

15011467
ctypedef fused diff_t:

pandas/_libs/algos_take_helper.pxi.in

-30
Original file line numberDiff line numberDiff line change
@@ -219,33 +219,3 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
219219
{{endif}}
220220

221221
{{endfor}}
222-
223-
# ----------------------------------------------------------------------
224-
# take_2d internal function
225-
# ----------------------------------------------------------------------
226-
227-
ctypedef fused take_t:
228-
float64_t
229-
uint64_t
230-
int64_t
231-
object
232-
233-
234-
cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx):
235-
cdef:
236-
Py_ssize_t i, j, N, K
237-
ndarray[intp_t, ndim=2, cast=True] indexer = idx
238-
ndarray[take_t, ndim=2] result
239-
240-
N, K = (<object>values).shape
241-
242-
if take_t is object:
243-
# evaluated at compile-time
244-
result = values.copy()
245-
else:
246-
result = np.empty_like(values)
247-
248-
for i in range(N):
249-
for j in range(K):
250-
result[i, j] = values[i, indexer[i, j]]
251-
return result

pandas/tests/frame/methods/test_rank.py

+33-9
Original file line numberDiff line numberDiff line change
@@ -246,23 +246,18 @@ def test_rank_methods_frame(self):
246246
expected = DataFrame(sprank, columns=cols).astype("float64")
247247
tm.assert_frame_equal(result, expected)
248248

249-
@td.skip_array_manager_not_yet_implemented
250249
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
251250
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
252251
def test_rank_descending(self, method, dtype):
253-
254252
if "i" in dtype:
255-
df = self.df.dropna()
253+
df = self.df.dropna().astype(dtype)
256254
else:
257255
df = self.df.astype(dtype)
258256

259257
res = df.rank(ascending=False)
260258
expected = (df.max() - df).rank()
261259
tm.assert_frame_equal(res, expected)
262260

263-
if method == "first" and dtype == "O":
264-
return
265-
266261
expected = (df.max() - df).rank(method=method)
267262

268263
if dtype != "O":
@@ -287,9 +282,6 @@ def _check2d(df, expected, method="average", axis=0):
287282
result = df.rank(method=method, axis=axis)
288283
tm.assert_frame_equal(result, exp_df)
289284

290-
disabled = {(object, "first")}
291-
if (dtype, method) in disabled:
292-
return
293285
frame = df if dtype is None else df.astype(dtype)
294286
_check2d(frame, self.results[method], method=method, axis=axis)
295287

@@ -456,6 +448,38 @@ def test_rank_both_inf(self):
456448
result = df.rank()
457449
tm.assert_frame_equal(result, expected)
458450

451+
@pytest.mark.parametrize(
452+
"na_option,ascending,expected",
453+
[
454+
("top", True, [3.0, 1.0, 2.0]),
455+
("top", False, [2.0, 1.0, 3.0]),
456+
("bottom", True, [2.0, 3.0, 1.0]),
457+
("bottom", False, [1.0, 3.0, 2.0]),
458+
],
459+
)
460+
def test_rank_inf_nans_na_option(
461+
self, frame_or_series, method, na_option, ascending, expected
462+
):
463+
obj = frame_or_series([np.inf, np.nan, -np.inf])
464+
result = obj.rank(method=method, na_option=na_option, ascending=ascending)
465+
expected = frame_or_series(expected)
466+
tm.assert_equal(result, expected)
467+
468+
@pytest.mark.parametrize(
469+
"na_option,ascending,expected",
470+
[
471+
("bottom", True, [1.0, 2.0, 4.0, 3.0]),
472+
("bottom", False, [1.0, 2.0, 4.0, 3.0]),
473+
("top", True, [2.0, 3.0, 1.0, 4.0]),
474+
("top", False, [2.0, 3.0, 1.0, 4.0]),
475+
],
476+
)
477+
def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
478+
obj = frame_or_series(["foo", "foo", None, "foo"])
479+
result = obj.rank(method="first", na_option=na_option, ascending=ascending)
480+
expected = frame_or_series(expected)
481+
tm.assert_equal(result, expected)
482+
459483
@pytest.mark.parametrize(
460484
"data,expected",
461485
[

0 commit comments

Comments
 (0)