@@ -1372,26 +1372,29 @@ def rank_2d(
1372
1372
Fast NaN-friendly version of ``scipy.stats.rankdata``.
1373
1373
"""
1374
1374
cdef:
1375
- Py_ssize_t i, j, z, k, n, dups = 0 , total_tie_count = 0
1376
- Py_ssize_t infs
1377
- ndarray[float64_t, ndim= 2 ] ranks
1375
+ Py_ssize_t k, n, col
1376
+ float64_t[::1 , :] out # Column-major so columns are contiguous
1377
+ int64_t[::1 , :] grp_sizes
1378
+ const intp_t[:] labels
1378
1379
ndarray[rank_t, ndim= 2 ] values
1379
- ndarray[intp_t, ndim= 2 ] argsort_indexer
1380
- ndarray[uint8_t, ndim= 2 ] mask
1381
- rank_t val, nan_fill_val
1382
- float64_t count, sum_ranks = 0.0
1383
- int tiebreak = 0
1384
- int64_t idx
1385
- bint check_mask, condition, keep_na, nans_rank_highest
1380
+ rank_t[:, :] masked_vals
1381
+ intp_t[:, :] sort_indexer
1382
+ uint8_t[:, :] mask
1383
+ TiebreakEnumType tiebreak
1384
+ bint check_mask, keep_na, nans_rank_highest
1385
+ rank_t nan_fill_val
1386
1386
1387
1387
tiebreak = tiebreakers[ties_method]
1388
+ if tiebreak == TIEBREAK_FIRST:
1389
+ if not ascending:
1390
+ tiebreak = TIEBREAK_FIRST_DESCENDING
1388
1391
1389
1392
keep_na = na_option == ' keep'
1390
1393
1391
1394
# For cases where a mask is not possible, we can avoid mask checks
1392
1395
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
1393
1396
1394
- if axis == 0 :
1397
+ if axis == 1 :
1395
1398
values = np.asarray(in_arr).T.copy()
1396
1399
else :
1397
1400
values = np.asarray(in_arr).copy()
@@ -1403,99 +1406,62 @@ def rank_2d(
1403
1406
nans_rank_highest = ascending ^ (na_option == ' top' )
1404
1407
if check_mask:
1405
1408
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
1409
+
1406
1410
if rank_t is object :
1407
- mask = missing.isnaobj2d(values)
1411
+ mask = missing.isnaobj2d(values).view(np.uint8)
1408
1412
elif rank_t is float64_t:
1409
- mask = np.isnan(values)
1413
+ mask = np.isnan(values).view(np.uint8)
1410
1414
1411
1415
# int64 and datetimelike
1412
1416
else :
1413
- mask = values == NPY_NAT
1414
-
1417
+ mask = (values == NPY_NAT).view(np.uint8)
1415
1418
np.putmask(values, mask, nan_fill_val)
1416
1419
else :
1417
- mask = np.zeros_like(values, dtype = bool )
1420
+ mask = np.zeros_like(values, dtype = np.uint8)
1421
+
1422
+ if nans_rank_highest:
1423
+ order = (values, mask)
1424
+ else :
1425
+ order = (values, ~ np.asarray(mask))
1418
1426
1419
1427
n, k = (< object > values).shape
1420
- ranks = np.empty((n, k), dtype = ' f8' )
1428
+ out = np.empty((n, k), dtype = ' f8' , order = ' F' )
1429
+ grp_sizes = np.ones((n, k), dtype = ' i8' , order = ' F' )
1430
+ labels = np.zeros(n, dtype = np.intp)
1421
1431
1422
- if tiebreak == TIEBREAK_FIRST:
1423
- # need to use a stable sort here
1424
- argsort_indexer = values.argsort(axis = 1 , kind = ' mergesort' )
1425
- if not ascending:
1426
- tiebreak = TIEBREAK_FIRST_DESCENDING
1432
+ # lexsort is slower, so only use if we need to worry about the mask
1433
+ if check_mask:
1434
+ sort_indexer = np.lexsort(order, axis = 0 ).astype(np.intp, copy = False )
1427
1435
else :
1428
- argsort_indexer = values.argsort(1 )
1436
+ kind = " stable" if ties_method == " first" else None
1437
+ sort_indexer = values.argsort(axis = 0 , kind = kind).astype(np.intp, copy = False )
1429
1438
1430
1439
if not ascending:
1431
- argsort_indexer = argsort_indexer[:, ::- 1 ]
1432
-
1433
- values = _take_2d(values, argsort_indexer)
1440
+ sort_indexer = sort_indexer[::- 1 , :]
1434
1441
1435
- for i in range (n):
1436
- dups = sum_ranks = infs = 0
1437
-
1438
- total_tie_count = 0
1439
- count = 0.0
1440
- for j in range (k):
1441
- val = values[i, j]
1442
- idx = argsort_indexer[i, j]
1443
- if keep_na and check_mask and mask[i, idx]:
1444
- ranks[i, idx] = NaN
1445
- infs += 1
1446
- continue
1447
-
1448
- count += 1.0
1449
-
1450
- sum_ranks += (j - infs) + 1
1451
- dups += 1
1452
-
1453
- if rank_t is object :
1454
- condition = (
1455
- j == k - 1 or
1456
- are_diff(values[i, j + 1 ], val) or
1457
- (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1 ]])
1458
- )
1459
- else :
1460
- condition = (
1461
- j == k - 1 or
1462
- values[i, j + 1 ] != val or
1463
- (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1 ]])
1464
- )
1465
-
1466
- if condition:
1467
- if tiebreak == TIEBREAK_AVERAGE:
1468
- for z in range (j - dups + 1 , j + 1 ):
1469
- ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
1470
- elif tiebreak == TIEBREAK_MIN:
1471
- for z in range (j - dups + 1 , j + 1 ):
1472
- ranks[i, argsort_indexer[i, z]] = j - dups + 2
1473
- elif tiebreak == TIEBREAK_MAX:
1474
- for z in range (j - dups + 1 , j + 1 ):
1475
- ranks[i, argsort_indexer[i, z]] = j + 1
1476
- elif tiebreak == TIEBREAK_FIRST:
1477
- if rank_t is object :
1478
- raise ValueError (' first not supported for non-numeric data' )
1479
- else :
1480
- for z in range (j - dups + 1 , j + 1 ):
1481
- ranks[i, argsort_indexer[i, z]] = z + 1
1482
- elif tiebreak == TIEBREAK_FIRST_DESCENDING:
1483
- for z in range (j - dups + 1 , j + 1 ):
1484
- ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
1485
- elif tiebreak == TIEBREAK_DENSE:
1486
- total_tie_count += 1
1487
- for z in range (j - dups + 1 , j + 1 ):
1488
- ranks[i, argsort_indexer[i, z]] = total_tie_count
1489
- sum_ranks = dups = 0
1490
- if pct:
1491
- if tiebreak == TIEBREAK_DENSE:
1492
- ranks[i, :] /= total_tie_count
1493
- else :
1494
- ranks[i, :] /= count
1495
- if axis == 0 :
1496
- return ranks.T
1442
+ # putmask doesn't accept a memoryview, so we assign in a separate step
1443
+ masked_vals = values
1444
+ with nogil:
1445
+ for col in range (k):
1446
+ rank_sorted_1d(
1447
+ out[:, col],
1448
+ grp_sizes[:, col],
1449
+ labels,
1450
+ sort_indexer[:, col],
1451
+ masked_vals[:, col],
1452
+ mask[:, col],
1453
+ tiebreak,
1454
+ check_mask,
1455
+ False ,
1456
+ keep_na,
1457
+ pct,
1458
+ n,
1459
+ )
1460
+
1461
+ if axis == 1 :
1462
+ return np.asarray(out.T)
1497
1463
else :
1498
- return ranks
1464
+ return np.asarray(out)
1499
1465
1500
1466
1501
1467
ctypedef fused diff_t:
0 commit comments