Skip to content

CLN: share rank_t fused type #43789

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 36 additions & 37 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ from numpy cimport (
cnp.import_array()

cimport pandas._libs.util as util
from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.khash cimport (
kh_destroy_int64,
kh_get_int64,
Expand Down Expand Up @@ -860,34 +861,30 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
# rank_1d, rank_2d
# ----------------------------------------------------------------------

ctypedef fused rank_t:
object
float64_t
uint64_t
int64_t


cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
cdef numeric_object_t get_rank_nan_fill_val(
bint rank_nans_highest,
numeric_object_t[:] _=None
):
"""
Return the value we'll use to represent missing values when sorting depending
on if we'd like missing values to end up at the top/bottom. (The second parameter
is unused, but needed for fused type specialization)
"""
if rank_nans_highest:
if rank_t is object:
if numeric_object_t is object:
return Infinity()
elif rank_t is int64_t:
elif numeric_object_t is int64_t:
return util.INT64_MAX
elif rank_t is uint64_t:
elif numeric_object_t is uint64_t:
return util.UINT64_MAX
else:
return np.inf
else:
if rank_t is object:
if numeric_object_t is object:
return NegInfinity()
elif rank_t is int64_t:
elif numeric_object_t is int64_t:
return NPY_NAT
elif rank_t is uint64_t:
elif numeric_object_t is uint64_t:
return 0
else:
return -np.inf
Expand All @@ -896,7 +893,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
@cython.wraparound(False)
@cython.boundscheck(False)
def rank_1d(
ndarray[rank_t, ndim=1] values,
ndarray[numeric_object_t, ndim=1] values,
const intp_t[:] labels=None,
bint is_datetimelike=False,
ties_method="average",
Expand All @@ -909,7 +906,7 @@ def rank_1d(

Parameters
----------
values : array of rank_t values to be ranked
values : array of numeric_object_t values to be ranked
labels : np.ndarray[np.intp] or None
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`. If not called
Expand Down Expand Up @@ -939,11 +936,11 @@ def rank_1d(
int64_t[::1] grp_sizes
intp_t[:] lexsort_indexer
float64_t[::1] out
ndarray[rank_t, ndim=1] masked_vals
rank_t[:] masked_vals_memview
ndarray[numeric_object_t, ndim=1] masked_vals
numeric_object_t[:] masked_vals_memview
uint8_t[:] mask
bint keep_na, nans_rank_highest, check_labels, check_mask
rank_t nan_fill_val
numeric_object_t nan_fill_val

tiebreak = tiebreakers[ties_method]
if tiebreak == TIEBREAK_FIRST:
Expand All @@ -964,21 +961,22 @@ def rank_1d(
check_labels = labels is not None

# For cases where a mask is not possible, we can avoid mask checks
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
check_mask = not (numeric_object_t is uint64_t or
(numeric_object_t is int64_t and not is_datetimelike))

# Copy values into new array in order to fill missing data
# with mask, without obfuscating location of missing data
# in values array
if rank_t is object and values.dtype != np.object_:
if numeric_object_t is object and values.dtype != np.object_:
masked_vals = values.astype('O')
else:
masked_vals = values.copy()

if rank_t is object:
if numeric_object_t is object:
mask = missing.isnaobj(masked_vals)
elif rank_t is int64_t and is_datetimelike:
elif numeric_object_t is int64_t and is_datetimelike:
mask = (masked_vals == NPY_NAT).astype(np.uint8)
elif rank_t is float64_t:
elif numeric_object_t is float64_t:
mask = np.isnan(masked_vals).astype(np.uint8)
else:
mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)
Expand All @@ -990,7 +988,7 @@ def rank_1d(
# will flip the ordering to still end up with lowest rank.
# Symmetric logic applies to `na_option == 'bottom'`
nans_rank_highest = ascending ^ (na_option == 'top')
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest)
if nans_rank_highest:
order = [masked_vals, mask]
else:
Expand Down Expand Up @@ -1037,7 +1035,7 @@ cdef void rank_sorted_1d(
int64_t[::1] grp_sizes,
const intp_t[:] sort_indexer,
# Can make const with cython3 (https://github.com/cython/cython/issues/3222)
rank_t[:] masked_vals,
numeric_object_t[:] masked_vals,
const uint8_t[:] mask,
bint check_mask,
Py_ssize_t N,
Expand All @@ -1061,7 +1059,7 @@ cdef void rank_sorted_1d(
if labels is None.
sort_indexer : intp_t[:]
Array of indices which sorts masked_vals
masked_vals : rank_t[:]
masked_vals : numeric_object_t[:]
The values input to rank_1d, with missing values replaced by fill values
mask : uint8_t[:]
Array where entries are True if the value is missing, False otherwise.
Expand Down Expand Up @@ -1093,7 +1091,7 @@ cdef void rank_sorted_1d(
# that sorted value for retrieval back from the original
# values / masked_vals arrays
# TODO: de-duplicate once cython supports conditional nogil
if rank_t is object:
if numeric_object_t is object:
with gil:
for i in range(N):
at_end = i == N - 1
Expand Down Expand Up @@ -1301,7 +1299,7 @@ cdef void rank_sorted_1d(


def rank_2d(
ndarray[rank_t, ndim=2] in_arr,
ndarray[numeric_object_t, ndim=2] in_arr,
int axis=0,
bint is_datetimelike=False,
ties_method="average",
Expand All @@ -1316,13 +1314,13 @@ def rank_2d(
Py_ssize_t k, n, col
float64_t[::1, :] out # Column-major so columns are contiguous
int64_t[::1] grp_sizes
ndarray[rank_t, ndim=2] values
rank_t[:, :] masked_vals
ndarray[numeric_object_t, ndim=2] values
numeric_object_t[:, :] masked_vals
intp_t[:, :] sort_indexer
uint8_t[:, :] mask
TiebreakEnumType tiebreak
bint check_mask, keep_na, nans_rank_highest
rank_t nan_fill_val
numeric_object_t nan_fill_val

tiebreak = tiebreakers[ties_method]
if tiebreak == TIEBREAK_FIRST:
Expand All @@ -1332,24 +1330,25 @@ def rank_2d(
keep_na = na_option == 'keep'

# For cases where a mask is not possible, we can avoid mask checks
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
check_mask = not (numeric_object_t is uint64_t or
(numeric_object_t is int64_t and not is_datetimelike))

if axis == 1:
values = np.asarray(in_arr).T.copy()
else:
values = np.asarray(in_arr).copy()

if rank_t is object:
if numeric_object_t is object:
if values.dtype != np.object_:
values = values.astype('O')

nans_rank_highest = ascending ^ (na_option == 'top')
if check_mask:
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest)

if rank_t is object:
if numeric_object_t is object:
mask = missing.isnaobj2d(values).view(np.uint8)
elif rank_t is float64_t:
elif numeric_object_t is float64_t:
mask = np.isnan(values).view(np.uint8)

# int64 and datetimelike
Expand Down
17 changes: 17 additions & 0 deletions pandas/_libs/dtypes.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
Common location for shared fused types
"""

from numpy cimport (
float32_t,
float64_t,
int64_t,
uint64_t,
)

# Fused type with five specializations: float64_t, float32_t, int64_t,
# uint64_t and (Python) object. It is cimported by both algos.pyx and
# groupby.pyx in place of their previously separate `rank_t` definitions.
ctypedef fused numeric_object_t:
float64_t
float32_t
int64_t
uint64_t
object
57 changes: 25 additions & 32 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ from pandas._libs.algos import (
take_2d_axis1_float64_float64,
)

from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.missing cimport checknull


Expand Down Expand Up @@ -921,45 +922,37 @@ def group_quantile(ndarray[float64_t, ndim=2] out,
# group_nth, group_last, group_rank
# ----------------------------------------------------------------------

ctypedef fused rank_t:
float64_t
float32_t
int64_t
uint64_t
object


cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
if rank_t is object:
cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
if numeric_object_t is object:
# Should never be used, but we need to avoid the `val != val` below
# or else cython will raise about gil acquisition.
raise NotImplementedError

elif rank_t is int64_t:
elif numeric_object_t is int64_t:
return is_datetimelike and val == NPY_NAT
elif rank_t is uint64_t:
elif numeric_object_t is uint64_t:
# There is no NA value for uint64
return False
else:
return val != val


# GH#31710 use memoryviews once cython 0.30 is released so we can
# use `const rank_t[:, :] values`
# use `const numeric_object_t[:, :] values`
@cython.wraparound(False)
@cython.boundscheck(False)
def group_last(rank_t[:, ::1] out,
def group_last(numeric_object_t[:, ::1] out,
int64_t[::1] counts,
ndarray[rank_t, ndim=2] values,
ndarray[numeric_object_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1) -> None:
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
rank_t val
ndarray[rank_t, ndim=2] resx
numeric_object_t val
ndarray[numeric_object_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False

Expand All @@ -970,14 +963,14 @@ def group_last(rank_t[:, ::1] out,

min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
if rank_t is object:
if numeric_object_t is object:
resx = np.empty((<object>out).shape, dtype=object)
else:
resx = np.empty_like(out)

N, K = (<object>values).shape

if rank_t is object:
if numeric_object_t is object:
# TODO: De-duplicate once conditional-nogil is available
for i in range(N):
lab = labels[i]
Expand Down Expand Up @@ -1019,9 +1012,9 @@ def group_last(rank_t[:, ::1] out,
for i in range(ncounts):
for j in range(K):
if nobs[i, j] < min_count:
if rank_t is int64_t:
if numeric_object_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
elif numeric_object_t is uint64_t:
runtime_error = True
break
else:
Expand All @@ -1037,12 +1030,12 @@ def group_last(rank_t[:, ::1] out,


# GH#31710 use memoryviews once cython 0.30 is released so we can
# use `const rank_t[:, :] values`
# use `const numeric_object_t[:, :] values`
@cython.wraparound(False)
@cython.boundscheck(False)
def group_nth(rank_t[:, ::1] out,
def group_nth(numeric_object_t[:, ::1] out,
int64_t[::1] counts,
ndarray[rank_t, ndim=2] values,
ndarray[numeric_object_t, ndim=2] values,
const intp_t[::1] labels,
int64_t min_count=-1,
int64_t rank=1,
Expand All @@ -1052,8 +1045,8 @@ def group_nth(rank_t[:, ::1] out,
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
rank_t val
ndarray[rank_t, ndim=2] resx
numeric_object_t val
ndarray[numeric_object_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False

Expand All @@ -1064,14 +1057,14 @@ def group_nth(rank_t[:, ::1] out,

min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
if rank_t is object:
if numeric_object_t is object:
resx = np.empty((<object>out).shape, dtype=object)
else:
resx = np.empty_like(out)

N, K = (<object>values).shape

if rank_t is object:
if numeric_object_t is object:
# TODO: De-duplicate once conditional-nogil is available
for i in range(N):
lab = labels[i]
Expand Down Expand Up @@ -1116,9 +1109,9 @@ def group_nth(rank_t[:, ::1] out,
for i in range(ncounts):
for j in range(K):
if nobs[i, j] < min_count:
if rank_t is int64_t:
if numeric_object_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
elif numeric_object_t is uint64_t:
runtime_error = True
break
else:
Expand All @@ -1135,7 +1128,7 @@ def group_nth(rank_t[:, ::1] out,
@cython.boundscheck(False)
@cython.wraparound(False)
def group_rank(float64_t[:, ::1] out,
ndarray[rank_t, ndim=2] values,
ndarray[numeric_object_t, ndim=2] values,
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike, str ties_method="average",
Expand All @@ -1147,7 +1140,7 @@ def group_rank(float64_t[:, ::1] out,
----------
out : np.ndarray[np.float64, ndim=2]
Values to which this method will write its results.
values : np.ndarray of rank_t values to be ranked
values : np.ndarray of numeric_object_t values to be ranked
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
Expand Down