Skip to content

CLN: allow label memviews to be None #42260

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Jul 1, 2021
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c7a91ac
REF: split out sorted_rank algo
mzeitlin11 Jun 9, 2021
4b0641e
Fixup docstring
mzeitlin11 Jun 9, 2021
b6dd4a6
WIP
mzeitlin11 Jun 9, 2021
953b188
WIP
mzeitlin11 Jun 10, 2021
254b997
premerge
mzeitlin11 Jun 10, 2021
5602dca
Merge remote-tracking branch 'upstream/master' into ref/rank_2d_dedup
mzeitlin11 Jun 10, 2021
29dc590
REF: give ranks same nan filling
mzeitlin11 Jun 10, 2021
9abd9da
Merge branch 'rank_2d_dedup_chunk' into ref/rank_2d_dedup
mzeitlin11 Jun 10, 2021
974650d
WIP
mzeitlin11 Jun 10, 2021
b840b74
Handle empty case early
mzeitlin11 Jun 10, 2021
f099bb0
Handle empty case early
mzeitlin11 Jun 10, 2021
4aa4f8b
WIP
mzeitlin11 Jun 10, 2021
c5ed688
WIP
mzeitlin11 Jun 10, 2021
7a04159
Add object first test
mzeitlin11 Jun 10, 2021
ab9989e
Add back nogil
mzeitlin11 Jun 10, 2021
5ba6459
Add whatsnew
mzeitlin11 Jun 10, 2021
6154004
Cleaner fused type handling
mzeitlin11 Jun 10, 2021
d678bbf
Merge branch 'rank_2d_dedup_chunk' into ref/rank_2d_dedup
mzeitlin11 Jun 10, 2021
0f8744d
Add comment
mzeitlin11 Jun 10, 2021
d47f2a6
Update whatsnew
mzeitlin11 Jun 10, 2021
da61fb8
Try 32-bit fix
mzeitlin11 Jun 10, 2021
e2d9617
Debug 32-bit
mzeitlin11 Jun 10, 2021
b4d11a4
Debug 32-bit
mzeitlin11 Jun 11, 2021
9a94724
Merge remote-tracking branch 'upstream/master' into ref/rank_2d_dedup
mzeitlin11 Jun 17, 2021
1e47dae
Move whatsnew
mzeitlin11 Jun 17, 2021
8d038af
WIP
mzeitlin11 Jun 17, 2021
f90b8d9
Clean up group sizes
mzeitlin11 Jun 17, 2021
f70cd30
CLN: allow memviews to be None
mzeitlin11 Jun 26, 2021
d9b2342
Fixups
mzeitlin11 Jun 26, 2021
c1c741c
Merge remote-tracking branch 'upstream/master' into ref/memviews_none…
mzeitlin11 Jun 29, 2021
e92d7c5
array -> asarray and better arg calling
mzeitlin11 Jun 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/_libs/algos.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def is_monotonic(

def rank_1d(
values: np.ndarray, # ndarray[rank_t, ndim=1]
labels: np.ndarray, # const int64_t[:]
labels: np.ndarray | None = ..., # const int64_t[:]=None
is_datetimelike: bool = ...,
ties_method=...,
ascending: bool = ...,
Expand Down
100 changes: 46 additions & 54 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -389,11 +389,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
int64_t nobs = 0
bint no_nans
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
const int64_t[:] labels_n, labels_nobs

N, K = (<object>mat).shape
# For compatibility when calling rank_1d
labels_n = np.zeros(N, dtype=np.int64)

# Handle the edge case where we know all results will be nan
# to keep conditional logic inside loop simpler
Expand All @@ -412,7 +409,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
maskedx = np.empty(N, dtype=np.float64)
maskedy = np.empty(N, dtype=np.float64)
for i in range(K):
ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n)
ranked_mat[:, i] = rank_1d(mat[:, i])

with nogil:
for xi in range(K):
Expand Down Expand Up @@ -451,11 +448,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
with gil:
# We need to slice back to nobs because rank_1d will
# require arrays of nobs length
labels_nobs = np.zeros(nobs, dtype=np.int64)
rankedx = rank_1d(np.array(maskedx)[:nobs],
labels=labels_nobs)
rankedy = rank_1d(np.array(maskedy)[:nobs],
labels=labels_nobs)
rankedx = rank_1d(np.array(maskedx)[:nobs])
rankedy = rank_1d(np.array(maskedy)[:nobs])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure if it makes a difference, but np.array -> np.asarray?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, was an oversight no reason to copy here

for i in range(nobs):
maskedx[i] = rankedx[i]
maskedy[i] = rankedy[i]
Expand Down Expand Up @@ -518,19 +512,16 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
int64_t total_discordant = 0
float64_t kendall_tau
int64_t n_obs
const intp_t[:] labels_n

N, K = (<object>mat).shape

result = np.empty((K, K), dtype=np.float64)
mask = np.isfinite(mat)

ranked_mat = np.empty((N, K), dtype=np.float64)
# For compatibility when calling rank_1d
labels_n = np.zeros(N, dtype=np.intp)

for i in range(K):
ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
ranked_mat[:, i] = rank_1d(mat[:, i])

for xi in range(K):
sorted_idxs = ranked_mat[:, xi].argsort()
Expand Down Expand Up @@ -961,7 +952,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
@cython.boundscheck(False)
def rank_1d(
ndarray[rank_t, ndim=1] values,
const intp_t[:] labels,
const intp_t[:] labels=None,
bint is_datetimelike=False,
ties_method="average",
bint ascending=True,
Expand All @@ -974,10 +965,10 @@ def rank_1d(
Parameters
----------
values : array of rank_t values to be ranked
labels : np.ndarray[np.intp]
labels : np.ndarray[np.intp] or None
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`. If not called
from a groupby operation, will be an array of 0's
from a groupby operation, will be None.
is_datetimelike : bool, default False
True if `values` contains datetime-like entries.
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
Expand Down Expand Up @@ -1017,14 +1008,15 @@ def rank_1d(
keep_na = na_option == 'keep'

N = len(values)
# TODO Cython 3.0: cast won't be necessary (#2992)
assert <Py_ssize_t>len(labels) == N
if labels is not None:
# TODO Cython 3.0: cast won't be necessary (#2992)
assert <Py_ssize_t>len(labels) == N
out = np.empty(N)
grp_sizes = np.ones(N, dtype=np.int64)

# If all 0 labels, can short-circuit later label
# If we don't care about labels, can short-circuit later label
# comparisons
check_labels = np.any(labels)
check_labels = labels is not None

# For cases where a mask is not possible, we can avoid mask checks
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
Expand Down Expand Up @@ -1055,9 +1047,12 @@ def rank_1d(
nans_rank_highest = ascending ^ (na_option == 'top')
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
if nans_rank_highest:
order = (masked_vals, mask, labels)
order = [masked_vals, mask]
else:
order = (masked_vals, ~(np.asarray(mask)), labels)
order = [masked_vals, ~(np.asarray(mask))]

if check_labels:
order.append(labels)

np.putmask(masked_vals, mask, nan_fill_val)
# putmask doesn't accept a memoryview, so we assign as a separate step
Expand All @@ -1076,16 +1071,15 @@ def rank_1d(
rank_sorted_1d(
out,
grp_sizes,
labels,
lexsort_indexer,
masked_vals_memview,
mask,
tiebreak,
check_mask,
check_labels,
keep_na,
pct,
N,
tiebreak=tiebreak,
keep_na=keep_na,
pct=pct,
labels=labels,
)

return np.asarray(out)
Expand All @@ -1096,17 +1090,18 @@ def rank_1d(
cdef void rank_sorted_1d(
float64_t[::1] out,
int64_t[::1] grp_sizes,
const intp_t[:] labels,
const intp_t[:] sort_indexer,
# Can make const with cython3 (https://github.com/cython/cython/issues/3222)
rank_t[:] masked_vals,
const uint8_t[:] mask,
TiebreakEnumType tiebreak,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as long as you're shifting these around, can you make everything from here-ish on keyword-only

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think cdef functions support keyword-only enforced by "*" (unless you have a workaround, I just tried and saw a cython compile error :)

Changed callers to treat it as keyword-only from after mask, though, which should help

bint check_mask,
bint check_labels,
bint keep_na,
bint pct,
Py_ssize_t N,
TiebreakEnumType tiebreak=TIEBREAK_AVERAGE,
bint keep_na=True,
bint pct=False,
# https://github.com/cython/cython/issues/1630, only trailing arguments can
# currently be omitted for cdef functions, which is why we keep this at the end
const intp_t[:] labels=None,
) nogil:
"""
See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
Expand All @@ -1117,35 +1112,36 @@ cdef void rank_sorted_1d(
out : float64_t[::1]
Array to store computed ranks
grp_sizes : int64_t[::1]
Array to store group counts, only used if pct=True
labels : See rank_1d.__doc__
Array to store group counts, only used if pct=True. Should only be None
if labels is None.
sort_indexer : intp_t[:]
Array of indices which sorts masked_vals
masked_vals : rank_t[:]
The values input to rank_1d, with missing values replaced by fill values
mask : uint8_t[:]
Array where entries are True if the value is missing, False otherwise
tiebreak : TiebreakEnumType
See rank_1d.__doc__ for the different modes
Array where entries are True if the value is missing, False otherwise.
check_mask : bool
If False, assumes the mask is all False to skip mask indexing
check_labels : bool
If False, assumes all labels are the same to skip group handling logic
keep_na : bool
Whether or not to keep nulls
pct : bool
Compute percentage rank of data within each group
N : Py_ssize_t
The number of elements to rank. Note: it is not always true that
N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE
See rank_1d.__doc__ for the different modes
keep_na : bool, default True
Whether or not to keep nulls
pct : bool, default False
Compute percentage rank of data within each group
labels : See rank_1d.__doc__, default None. None implies all labels are the same.
"""

cdef:
Py_ssize_t i, j, dups=0, sum_ranks=0,
Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0
bint at_end, next_val_diff, group_changed
bint at_end, next_val_diff, group_changed, check_labels
int64_t grp_size

check_labels = labels is not None

# Loop over the length of the value array
# each incremental i value can be looked up in the lexsort_indexer
# array that we sorted previously, which gives us the location of
Expand Down Expand Up @@ -1374,8 +1370,7 @@ def rank_2d(
cdef:
Py_ssize_t k, n, col
float64_t[::1, :] out # Column-major so columns are contiguous
int64_t[::1, :] grp_sizes
const intp_t[:] labels
int64_t[::1] grp_sizes
ndarray[rank_t, ndim=2] values
rank_t[:, :] masked_vals
intp_t[:, :] sort_indexer
Expand Down Expand Up @@ -1426,8 +1421,7 @@ def rank_2d(

n, k = (<object>values).shape
out = np.empty((n, k), dtype='f8', order='F')
grp_sizes = np.ones((n, k), dtype='i8', order='F')
labels = np.zeros(n, dtype=np.intp)
grp_sizes = np.ones(n, dtype=np.int64)

# lexsort is slower, so only use if we need to worry about the mask
if check_mask:
Expand All @@ -1445,17 +1439,15 @@ def rank_2d(
for col in range(k):
rank_sorted_1d(
out[:, col],
grp_sizes[:, col],
labels,
grp_sizes,
sort_indexer[:, col],
masked_vals[:, col],
mask[:, col],
tiebreak,
check_mask,
False,
keep_na,
pct,
n,
tiebreak=tiebreak,
keep_na=keep_na,
pct=pct,
)

if axis == 1:
Expand Down
1 change: 0 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1008,7 +1008,6 @@ def rank(
if values.ndim == 1:
ranks = algos.rank_1d(
values,
labels=np.zeros(len(values), dtype=np.intp),
is_datetimelike=is_datetimelike,
ties_method=method,
ascending=ascending,
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1747,7 +1747,7 @@ def test_scipy_compat(self):
def _check(arr):
mask = ~np.isfinite(arr)
arr = arr.copy()
result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.intp))
result = libalgos.rank_1d(arr)
arr[mask] = np.inf
exp = rankdata(arr)
exp[mask] = np.nan
Expand Down