@@ -389,11 +389,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
389
389
int64_t nobs = 0
390
390
bint no_nans
391
391
float64_t vx , vy , sumx , sumxx , sumyy , mean , divisor
392
- const int64_t[:] labels_n , labels_nobs
393
392
394
393
N , K = (< object > mat).shape
395
- # For compatibility when calling rank_1d
396
- labels_n = np.zeros(N, dtype = np.int64)
397
394
398
395
# Handle the edge case where we know all results will be nan
399
396
# to keep conditional logic inside loop simpler
@@ -412,7 +409,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
412
409
maskedx = np.empty(N, dtype = np.float64)
413
410
maskedy = np.empty(N, dtype = np.float64)
414
411
for i in range(K ):
415
- ranked_mat[:, i] = rank_1d(mat[:, i], labels = labels_n )
412
+ ranked_mat[:, i] = rank_1d(mat[:, i])
416
413
417
414
with nogil:
418
415
for xi in range (K):
@@ -451,11 +448,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
451
448
with gil:
452
449
# We need to slice back to nobs because rank_1d will
453
450
# require arrays of nobs length
454
- labels_nobs = np.zeros(nobs, dtype = np.int64)
455
- rankedx = rank_1d(np.array(maskedx)[:nobs],
456
- labels = labels_nobs)
457
- rankedy = rank_1d(np.array(maskedy)[:nobs],
458
- labels = labels_nobs)
451
+ rankedx = rank_1d(np.asarray(maskedx)[:nobs])
452
+ rankedy = rank_1d(np.asarray(maskedy)[:nobs])
459
453
for i in range (nobs):
460
454
maskedx[i] = rankedx[i]
461
455
maskedy[i] = rankedy[i]
@@ -518,19 +512,16 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
518
512
int64_t total_discordant = 0
519
513
float64_t kendall_tau
520
514
int64_t n_obs
521
- const intp_t[:] labels_n
522
515
523
516
N , K = (< object > mat).shape
524
517
525
518
result = np.empty((K, K), dtype = np.float64)
526
519
mask = np.isfinite(mat)
527
520
528
521
ranked_mat = np.empty((N, K), dtype = np.float64)
529
- # For compatibility when calling rank_1d
530
- labels_n = np.zeros(N, dtype = np.intp)
531
522
532
523
for i in range(K ):
533
- ranked_mat[:, i] = rank_1d(mat[:, i], labels_n )
524
+ ranked_mat[:, i] = rank_1d(mat[:, i])
534
525
535
526
for xi in range (K):
536
527
sorted_idxs = ranked_mat[:, xi].argsort()
@@ -961,7 +952,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
961
952
@ cython.boundscheck (False )
962
953
def rank_1d (
963
954
ndarray[rank_t , ndim = 1 ] values,
964
- const intp_t[:] labels ,
955
+ const intp_t[:] labels = None ,
965
956
bint is_datetimelike = False ,
966
957
ties_method = " average" ,
967
958
bint ascending = True ,
@@ -974,10 +965,10 @@ def rank_1d(
974
965
Parameters
975
966
----------
976
967
values : array of rank_t values to be ranked
977
- labels : np.ndarray[np.intp]
968
+ labels : np.ndarray[np.intp] or None
978
969
Array containing unique label for each group, with its ordering
979
970
matching up to the corresponding record in `values`. If not called
980
- from a groupby operation, will be an array of 0's
971
+ from a groupby operation, will be None.
981
972
is_datetimelike : bool, default False
982
973
True if `values` contains datetime-like entries.
983
974
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
@@ -1017,14 +1008,15 @@ def rank_1d(
1017
1008
keep_na = na_option == ' keep'
1018
1009
1019
1010
N = len (values)
1020
- # TODO Cython 3.0: cast won't be necessary (#2992)
1021
- assert < Py_ssize_t> len (labels) == N
1011
+ if labels is not None :
1012
+ # TODO Cython 3.0: cast won't be necessary (#2992)
1013
+ assert < Py_ssize_t> len (labels) == N
1022
1014
out = np.empty(N)
1023
1015
grp_sizes = np.ones(N, dtype = np.int64)
1024
1016
1025
- # If all 0 labels, can short-circuit later label
1017
+ # If we don't care about labels, can short-circuit later label
1026
1018
# comparisons
1027
- check_labels = np.any( labels)
1019
+ check_labels = labels is not None
1028
1020
1029
1021
# For cases where a mask is not possible, we can avoid mask checks
1030
1022
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
@@ -1055,9 +1047,12 @@ def rank_1d(
1055
1047
nans_rank_highest = ascending ^ (na_option == ' top' )
1056
1048
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
1057
1049
if nans_rank_highest:
1058
- order = ( masked_vals, mask, labels)
1050
+ order = [ masked_vals, mask]
1059
1051
else :
1060
- order = (masked_vals, ~ (np.asarray(mask)), labels)
1052
+ order = [masked_vals, ~ (np.asarray(mask))]
1053
+
1054
+ if check_labels:
1055
+ order.append(labels)
1061
1056
1062
1057
np.putmask(masked_vals, mask, nan_fill_val)
1063
1058
# putmask doesn't accept a memoryview, so we assign as a separate step
@@ -1076,16 +1071,15 @@ def rank_1d(
1076
1071
rank_sorted_1d(
1077
1072
out,
1078
1073
grp_sizes,
1079
- labels,
1080
1074
lexsort_indexer,
1081
1075
masked_vals_memview,
1082
1076
mask,
1083
- tiebreak ,
1084
- check_mask ,
1085
- check_labels ,
1086
- keep_na,
1087
- pct,
1088
- N ,
1077
+ check_mask = check_mask ,
1078
+ N = N ,
1079
+ tiebreak = tiebreak ,
1080
+ keep_na = keep_na ,
1081
+ pct = pct ,
1082
+ labels = labels ,
1089
1083
)
1090
1084
1091
1085
return np.asarray(out)
@@ -1096,17 +1090,18 @@ def rank_1d(
1096
1090
cdef void rank_sorted_1d(
1097
1091
float64_t[::1 ] out,
1098
1092
int64_t[::1 ] grp_sizes,
1099
- const intp_t[:] labels,
1100
1093
const intp_t[:] sort_indexer,
1101
1094
# Can make const with cython3 (https://github.com/cython/cython/issues/3222)
1102
1095
rank_t[:] masked_vals,
1103
1096
const uint8_t[:] mask,
1104
- TiebreakEnumType tiebreak,
1105
1097
bint check_mask,
1106
- bint check_labels,
1107
- bint keep_na,
1108
- bint pct,
1109
1098
Py_ssize_t N,
1099
+ TiebreakEnumType tiebreak = TIEBREAK_AVERAGE,
1100
+ bint keep_na = True ,
1101
+ bint pct = False ,
1102
+ # https://github.com/cython/cython/issues/1630, only trailing arguments can
1103
+ # currently be omitted for cdef functions, which is why we keep this at the end
1104
+ const intp_t[:] labels = None ,
1110
1105
) nogil:
1111
1106
"""
1112
1107
See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
@@ -1117,35 +1112,36 @@ cdef void rank_sorted_1d(
1117
1112
out : float64_t[::1]
1118
1113
Array to store computed ranks
1119
1114
grp_sizes : int64_t[::1]
1120
- Array to store group counts, only used if pct=True
1121
- labels : See rank_1d.__doc__
1115
+ Array to store group counts, only used if pct=True. Should only be None
1116
+ if labels is None.
1122
1117
sort_indexer : intp_t[:]
1123
1118
Array of indices which sorts masked_vals
1124
1119
masked_vals : rank_t[:]
1125
1120
The values input to rank_1d, with missing values replaced by fill values
1126
1121
mask : uint8_t[:]
1127
- Array where entries are True if the value is missing, False otherwise
1128
- tiebreak : TiebreakEnumType
1129
- See rank_1d.__doc__ for the different modes
1122
+ Array where entries are True if the value is missing, False otherwise.
1130
1123
check_mask : bool
1131
1124
If False, assumes the mask is all False to skip mask indexing
1132
- check_labels : bool
1133
- If False, assumes all labels are the same to skip group handling logic
1134
- keep_na : bool
1135
- Whether or not to keep nulls
1136
- pct : bool
1137
- Compute percentage rank of data within each group
1138
1125
N : Py_ssize_t
1139
1126
The number of elements to rank. Note: it is not always true that
1140
1127
N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
1128
+ tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE
1129
+ See rank_1d.__doc__ for the different modes
1130
+ keep_na : bool, default True
1131
+ Whether or not to keep nulls
1132
+ pct : bool, default False
1133
+ Compute percentage rank of data within each group
1134
+ labels : See rank_1d.__doc__, default None. None implies all labels are the same.
1141
1135
"""
1142
1136
1143
1137
cdef:
1144
1138
Py_ssize_t i, j, dups= 0 , sum_ranks= 0 ,
1145
1139
Py_ssize_t grp_start= 0 , grp_vals_seen= 1 , grp_na_count= 0
1146
- bint at_end, next_val_diff, group_changed
1140
+ bint at_end, next_val_diff, group_changed, check_labels
1147
1141
int64_t grp_size
1148
1142
1143
+ check_labels = labels is not None
1144
+
1149
1145
# Loop over the length of the value array
1150
1146
# each incremental i value can be looked up in the lexsort_indexer
1151
1147
# array that we sorted previously, which gives us the location of
@@ -1374,8 +1370,7 @@ def rank_2d(
1374
1370
cdef:
1375
1371
Py_ssize_t k, n, col
1376
1372
float64_t[::1 , :] out # Column-major so columns are contiguous
1377
- int64_t[::1 , :] grp_sizes
1378
- const intp_t[:] labels
1373
+ int64_t[::1 ] grp_sizes
1379
1374
ndarray[rank_t, ndim= 2 ] values
1380
1375
rank_t[:, :] masked_vals
1381
1376
intp_t[:, :] sort_indexer
@@ -1426,8 +1421,7 @@ def rank_2d(
1426
1421
1427
1422
n, k = (< object > values).shape
1428
1423
out = np.empty((n, k), dtype = ' f8' , order = ' F' )
1429
- grp_sizes = np.ones((n, k), dtype = ' i8' , order = ' F' )
1430
- labels = np.zeros(n, dtype = np.intp)
1424
+ grp_sizes = np.ones(n, dtype = np.int64)
1431
1425
1432
1426
# lexsort is slower, so only use if we need to worry about the mask
1433
1427
if check_mask:
@@ -1445,17 +1439,15 @@ def rank_2d(
1445
1439
for col in range (k):
1446
1440
rank_sorted_1d(
1447
1441
out[:, col],
1448
- grp_sizes[:, col],
1449
- labels,
1442
+ grp_sizes,
1450
1443
sort_indexer[:, col],
1451
1444
masked_vals[:, col],
1452
1445
mask[:, col],
1453
- tiebreak,
1454
- check_mask,
1455
- False ,
1456
- keep_na,
1457
- pct,
1458
- n,
1446
+ check_mask = check_mask,
1447
+ N = n,
1448
+ tiebreak = tiebreak,
1449
+ keep_na = keep_na,
1450
+ pct = pct,
1459
1451
)
1460
1452
1461
1453
if axis == 1 :
0 commit comments