Skip to content

Commit 47bb52f

Browse files
mzeitlin11 authored and JulianWgs committed
CLN: allow label memviews to be None (pandas-dev#42260)
1 parent 04727cc commit 47bb52f

File tree

4 files changed

+52
-61
lines changed

4 files changed

+52
-61
lines changed

pandas/_libs/algos.pyi

+1-1
Original file line number | Diff line number | Diff line change
@@ -123,7 +123,7 @@ def is_monotonic(
123123

124124
def rank_1d(
125125
values: np.ndarray, # ndarray[rank_t, ndim=1]
126-
labels: np.ndarray, # const int64_t[:]
126+
labels: np.ndarray | None = ..., # const int64_t[:]=None
127127
is_datetimelike: bool = ...,
128128
ties_method=...,
129129
ascending: bool = ...,

pandas/_libs/algos.pyx

+50-58
Original file line number | Diff line number | Diff line change
@@ -389,11 +389,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
389389
int64_t nobs = 0
390390
bint no_nans
391391
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
392-
const int64_t[:] labels_n, labels_nobs
393392

394393
N, K = (<object>mat).shape
395-
# For compatibility when calling rank_1d
396-
labels_n = np.zeros(N, dtype=np.int64)
397394

398395
# Handle the edge case where we know all results will be nan
399396
# to keep conditional logic inside loop simpler
@@ -412,7 +409,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
412409
maskedx = np.empty(N, dtype=np.float64)
413410
maskedy = np.empty(N, dtype=np.float64)
414411
for i in range(K):
415-
ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n)
412+
ranked_mat[:, i] = rank_1d(mat[:, i])
416413

417414
with nogil:
418415
for xi in range(K):
@@ -451,11 +448,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
451448
with gil:
452449
# We need to slice back to nobs because rank_1d will
453450
# require arrays of nobs length
454-
labels_nobs = np.zeros(nobs, dtype=np.int64)
455-
rankedx = rank_1d(np.array(maskedx)[:nobs],
456-
labels=labels_nobs)
457-
rankedy = rank_1d(np.array(maskedy)[:nobs],
458-
labels=labels_nobs)
451+
rankedx = rank_1d(np.asarray(maskedx)[:nobs])
452+
rankedy = rank_1d(np.asarray(maskedy)[:nobs])
459453
for i in range(nobs):
460454
maskedx[i] = rankedx[i]
461455
maskedy[i] = rankedy[i]
@@ -518,19 +512,16 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
518512
int64_t total_discordant = 0
519513
float64_t kendall_tau
520514
int64_t n_obs
521-
const intp_t[:] labels_n
522515

523516
N, K = (<object>mat).shape
524517

525518
result = np.empty((K, K), dtype=np.float64)
526519
mask = np.isfinite(mat)
527520

528521
ranked_mat = np.empty((N, K), dtype=np.float64)
529-
# For compatibility when calling rank_1d
530-
labels_n = np.zeros(N, dtype=np.intp)
531522

532523
for i in range(K):
533-
ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
524+
ranked_mat[:, i] = rank_1d(mat[:, i])
534525

535526
for xi in range(K):
536527
sorted_idxs = ranked_mat[:, xi].argsort()
@@ -961,7 +952,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
961952
@cython.boundscheck(False)
962953
def rank_1d(
963954
ndarray[rank_t, ndim=1] values,
964-
const intp_t[:] labels,
955+
const intp_t[:] labels=None,
965956
bint is_datetimelike=False,
966957
ties_method="average",
967958
bint ascending=True,
@@ -974,10 +965,10 @@ def rank_1d(
974965
Parameters
975966
----------
976967
values : array of rank_t values to be ranked
977-
labels : np.ndarray[np.intp]
968+
labels : np.ndarray[np.intp] or None
978969
Array containing unique label for each group, with its ordering
979970
matching up to the corresponding record in `values`. If not called
980-
from a groupby operation, will be an array of 0's
971+
from a groupby operation, will be None.
981972
is_datetimelike : bool, default False
982973
True if `values` contains datetime-like entries.
983974
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
@@ -1017,14 +1008,15 @@ def rank_1d(
10171008
keep_na = na_option == 'keep'
10181009

10191010
N = len(values)
1020-
# TODO Cython 3.0: cast won't be necessary (#2992)
1021-
assert <Py_ssize_t>len(labels) == N
1011+
if labels is not None:
1012+
# TODO Cython 3.0: cast won't be necessary (#2992)
1013+
assert <Py_ssize_t>len(labels) == N
10221014
out = np.empty(N)
10231015
grp_sizes = np.ones(N, dtype=np.int64)
10241016

1025-
# If all 0 labels, can short-circuit later label
1017+
# If we don't care about labels, can short-circuit later label
10261018
# comparisons
1027-
check_labels = np.any(labels)
1019+
check_labels = labels is not None
10281020

10291021
# For cases where a mask is not possible, we can avoid mask checks
10301022
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
@@ -1055,9 +1047,12 @@ def rank_1d(
10551047
nans_rank_highest = ascending ^ (na_option == 'top')
10561048
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
10571049
if nans_rank_highest:
1058-
order = (masked_vals, mask, labels)
1050+
order = [masked_vals, mask]
10591051
else:
1060-
order = (masked_vals, ~(np.asarray(mask)), labels)
1052+
order = [masked_vals, ~(np.asarray(mask))]
1053+
1054+
if check_labels:
1055+
order.append(labels)
10611056

10621057
np.putmask(masked_vals, mask, nan_fill_val)
10631058
# putmask doesn't accept a memoryview, so we assign as a separate step
@@ -1076,16 +1071,15 @@ def rank_1d(
10761071
rank_sorted_1d(
10771072
out,
10781073
grp_sizes,
1079-
labels,
10801074
lexsort_indexer,
10811075
masked_vals_memview,
10821076
mask,
1083-
tiebreak,
1084-
check_mask,
1085-
check_labels,
1086-
keep_na,
1087-
pct,
1088-
N,
1077+
check_mask=check_mask,
1078+
N=N,
1079+
tiebreak=tiebreak,
1080+
keep_na=keep_na,
1081+
pct=pct,
1082+
labels=labels,
10891083
)
10901084

10911085
return np.asarray(out)
@@ -1096,17 +1090,18 @@ def rank_1d(
10961090
cdef void rank_sorted_1d(
10971091
float64_t[::1] out,
10981092
int64_t[::1] grp_sizes,
1099-
const intp_t[:] labels,
11001093
const intp_t[:] sort_indexer,
11011094
# Can make const with cython3 (https://github.com/cython/cython/issues/3222)
11021095
rank_t[:] masked_vals,
11031096
const uint8_t[:] mask,
1104-
TiebreakEnumType tiebreak,
11051097
bint check_mask,
1106-
bint check_labels,
1107-
bint keep_na,
1108-
bint pct,
11091098
Py_ssize_t N,
1099+
TiebreakEnumType tiebreak=TIEBREAK_AVERAGE,
1100+
bint keep_na=True,
1101+
bint pct=False,
1102+
# https://github.com/cython/cython/issues/1630, only trailing arguments can
1103+
# currently be omitted for cdef functions, which is why we keep this at the end
1104+
const intp_t[:] labels=None,
11101105
) nogil:
11111106
"""
11121107
See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
@@ -1117,35 +1112,36 @@ cdef void rank_sorted_1d(
11171112
out : float64_t[::1]
11181113
Array to store computed ranks
11191114
grp_sizes : int64_t[::1]
1120-
Array to store group counts, only used if pct=True
1121-
labels : See rank_1d.__doc__
1115+
Array to store group counts, only used if pct=True. Should only be None
1116+
if labels is None.
11221117
sort_indexer : intp_t[:]
11231118
Array of indices which sorts masked_vals
11241119
masked_vals : rank_t[:]
11251120
The values input to rank_1d, with missing values replaced by fill values
11261121
mask : uint8_t[:]
1127-
Array where entries are True if the value is missing, False otherwise
1128-
tiebreak : TiebreakEnumType
1129-
See rank_1d.__doc__ for the different modes
1122+
Array where entries are True if the value is missing, False otherwise.
11301123
check_mask : bool
11311124
If False, assumes the mask is all False to skip mask indexing
1132-
check_labels : bool
1133-
If False, assumes all labels are the same to skip group handling logic
1134-
keep_na : bool
1135-
Whether or not to keep nulls
1136-
pct : bool
1137-
Compute percentage rank of data within each group
11381125
N : Py_ssize_t
11391126
The number of elements to rank. Note: it is not always true that
11401127
N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
1128+
tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE
1129+
See rank_1d.__doc__ for the different modes
1130+
keep_na : bool, default True
1131+
Whether or not to keep nulls
1132+
pct : bool, default False
1133+
Compute percentage rank of data within each group
1134+
labels : See rank_1d.__doc__, default None. None implies all labels are the same.
11411135
"""
11421136

11431137
cdef:
11441138
Py_ssize_t i, j, dups=0, sum_ranks=0,
11451139
Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0
1146-
bint at_end, next_val_diff, group_changed
1140+
bint at_end, next_val_diff, group_changed, check_labels
11471141
int64_t grp_size
11481142

1143+
check_labels = labels is not None
1144+
11491145
# Loop over the length of the value array
11501146
# each incremental i value can be looked up in the lexsort_indexer
11511147
# array that we sorted previously, which gives us the location of
@@ -1374,8 +1370,7 @@ def rank_2d(
13741370
cdef:
13751371
Py_ssize_t k, n, col
13761372
float64_t[::1, :] out # Column-major so columns are contiguous
1377-
int64_t[::1, :] grp_sizes
1378-
const intp_t[:] labels
1373+
int64_t[::1] grp_sizes
13791374
ndarray[rank_t, ndim=2] values
13801375
rank_t[:, :] masked_vals
13811376
intp_t[:, :] sort_indexer
@@ -1426,8 +1421,7 @@ def rank_2d(
14261421

14271422
n, k = (<object>values).shape
14281423
out = np.empty((n, k), dtype='f8', order='F')
1429-
grp_sizes = np.ones((n, k), dtype='i8', order='F')
1430-
labels = np.zeros(n, dtype=np.intp)
1424+
grp_sizes = np.ones(n, dtype=np.int64)
14311425

14321426
# lexsort is slower, so only use if we need to worry about the mask
14331427
if check_mask:
@@ -1445,17 +1439,15 @@ def rank_2d(
14451439
for col in range(k):
14461440
rank_sorted_1d(
14471441
out[:, col],
1448-
grp_sizes[:, col],
1449-
labels,
1442+
grp_sizes,
14501443
sort_indexer[:, col],
14511444
masked_vals[:, col],
14521445
mask[:, col],
1453-
tiebreak,
1454-
check_mask,
1455-
False,
1456-
keep_na,
1457-
pct,
1458-
n,
1446+
check_mask=check_mask,
1447+
N=n,
1448+
tiebreak=tiebreak,
1449+
keep_na=keep_na,
1450+
pct=pct,
14591451
)
14601452

14611453
if axis == 1:

pandas/core/algorithms.py

-1
Original file line number | Diff line number | Diff line change
@@ -1008,7 +1008,6 @@ def rank(
10081008
if values.ndim == 1:
10091009
ranks = algos.rank_1d(
10101010
values,
1011-
labels=np.zeros(len(values), dtype=np.intp),
10121011
is_datetimelike=is_datetimelike,
10131012
ties_method=method,
10141013
ascending=ascending,

pandas/tests/test_algos.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1756,7 +1756,7 @@ def test_scipy_compat(self):
17561756
def _check(arr):
17571757
mask = ~np.isfinite(arr)
17581758
arr = arr.copy()
1759-
result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.intp))
1759+
result = libalgos.rank_1d(arr)
17601760
arr[mask] = np.inf
17611761
exp = rankdata(arr)
17621762
exp[mask] = np.nan

0 commit comments

Comments
 (0)