diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index a18cfc41d1e2e..22e2abc9b9c36 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -45,6 +45,7 @@ from numpy cimport (
 cnp.import_array()
 
 cimport pandas._libs.util as util
+from pandas._libs.dtypes cimport numeric_object_t
 from pandas._libs.khash cimport (
     kh_destroy_int64,
     kh_get_int64,
@@ -860,34 +861,30 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
 # rank_1d, rank_2d
 # ----------------------------------------------------------------------
 
-ctypedef fused rank_t:
-    object
-    float64_t
-    uint64_t
-    int64_t
-
-
-cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
+cdef numeric_object_t get_rank_nan_fill_val(
+    bint rank_nans_highest,
+    numeric_object_t[:] _=None
+):
     """
     Return the value we'll use to represent missing values when
     sorting depending on if we'd like missing values to end up at the top/bottom.
     (The second parameter is unused, but needed for fused type specialization)
     """
     if rank_nans_highest:
-        if rank_t is object:
+        if numeric_object_t is object:
             return Infinity()
-        elif rank_t is int64_t:
+        elif numeric_object_t is int64_t:
             return util.INT64_MAX
-        elif rank_t is uint64_t:
+        elif numeric_object_t is uint64_t:
             return util.UINT64_MAX
         else:
             return np.inf
     else:
-        if rank_t is object:
+        if numeric_object_t is object:
             return NegInfinity()
-        elif rank_t is int64_t:
+        elif numeric_object_t is int64_t:
             return NPY_NAT
-        elif rank_t is uint64_t:
+        elif numeric_object_t is uint64_t:
             return 0
         else:
             return -np.inf
@@ -896,7 +893,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def rank_1d(
-    ndarray[rank_t, ndim=1] values,
+    ndarray[numeric_object_t, ndim=1] values,
     const intp_t[:] labels=None,
     bint is_datetimelike=False,
     ties_method="average",
@@ -909,7 +906,7 @@ def rank_1d(
 
     Parameters
     ----------
-    values : array of rank_t values to be ranked
+    values : array of numeric_object_t values to be ranked
     labels : np.ndarray[np.intp] or None
         Array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`. If not called
@@ -939,11 +936,11 @@ def rank_1d(
         int64_t[::1] grp_sizes
         intp_t[:] lexsort_indexer
         float64_t[::1] out
-        ndarray[rank_t, ndim=1] masked_vals
-        rank_t[:] masked_vals_memview
+        ndarray[numeric_object_t, ndim=1] masked_vals
+        numeric_object_t[:] masked_vals_memview
         uint8_t[:] mask
         bint keep_na, nans_rank_highest, check_labels, check_mask
-        rank_t nan_fill_val
+        numeric_object_t nan_fill_val
 
     tiebreak = tiebreakers[ties_method]
     if tiebreak == TIEBREAK_FIRST:
@@ -964,21 +961,22 @@ def rank_1d(
     check_labels = labels is not None
 
     # For cases where a mask is not possible, we can avoid mask checks
-    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
+    check_mask = not (numeric_object_t is uint64_t or
+                      (numeric_object_t is int64_t and not is_datetimelike))
 
     # Copy values into new array in order to fill missing data
     # with mask, without obfuscating location of missing data
     # in values array
-    if rank_t is object and values.dtype != np.object_:
+    if numeric_object_t is object and values.dtype != np.object_:
         masked_vals = values.astype('O')
     else:
         masked_vals = values.copy()
 
-    if rank_t is object:
+    if numeric_object_t is object:
         mask = missing.isnaobj(masked_vals)
-    elif rank_t is int64_t and is_datetimelike:
+    elif numeric_object_t is int64_t and is_datetimelike:
         mask = (masked_vals == NPY_NAT).astype(np.uint8)
-    elif rank_t is float64_t:
+    elif numeric_object_t is float64_t:
         mask = np.isnan(masked_vals).astype(np.uint8)
     else:
         mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)
@@ -990,7 +988,7 @@ def rank_1d(
     # will flip the ordering to still end up with lowest rank.
     # Symmetric logic applies to `na_option == 'bottom'`
     nans_rank_highest = ascending ^ (na_option == 'top')
-    nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
+    nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest)
     if nans_rank_highest:
         order = [masked_vals, mask]
     else:
@@ -1037,7 +1035,7 @@ cdef void rank_sorted_1d(
     int64_t[::1] grp_sizes,
     const intp_t[:] sort_indexer,
     # Can make const with cython3 (https://github.com/cython/cython/issues/3222)
-    rank_t[:] masked_vals,
+    numeric_object_t[:] masked_vals,
     const uint8_t[:] mask,
     bint check_mask,
     Py_ssize_t N,
@@ -1061,7 +1059,7 @@ cdef void rank_sorted_1d(
         if labels is None.
    sort_indexer : intp_t[:]
        Array of indices which sorts masked_vals
-    masked_vals : rank_t[:]
+    masked_vals : numeric_object_t[:]
        The values input to rank_1d, with missing values replaced by fill values
    mask : uint8_t[:]
        Array where entries are True if the value is missing, False otherwise.
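
Aside (not part of the patch): the ranking code above replaces missing entries with a dtype-appropriate sentinel from `get_rank_nan_fill_val` before sorting, so that NaNs land at whichever end `ascending`/`na_option` require. A minimal fill-then-sort sketch of that idea, with a hypothetical file name and none of the pandas-specific machinery:

```cython
# nan_order_demo.pyx -- illustrative sketch only (hypothetical file name);
# build with `cythonize -i nan_order_demo.pyx`.
import numpy as np


def nan_order(values, bint nans_highest=True):
    # Fill-then-sort: replace missing entries with +/-inf so a plain stable
    # argsort pushes them to the desired end, while `mask` still records
    # where the missing entries originally were.
    values = np.asarray(values, dtype=np.float64)
    mask = np.isnan(values)
    fill = np.inf if nans_highest else -np.inf
    masked = np.where(mask, fill, values)
    return np.argsort(masked, kind="stable"), mask
```

Integer dtypes have no NaN, which is why the patch falls back to `util.INT64_MAX`/`NPY_NAT` (and `util.UINT64_MAX`/`0` for uint64) as sentinels, and why `check_mask` lets plain int64/uint64 input skip the mask checks entirely.
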
@@ -1093,7 +1091,7 @@ cdef void rank_sorted_1d(
     # that sorted value for retrieval back from the original
     # values / masked_vals arrays
     # TODO: de-duplicate once cython supports conditional nogil
-    if rank_t is object:
+    if numeric_object_t is object:
         with gil:
             for i in range(N):
                 at_end = i == N - 1
@@ -1301,7 +1299,7 @@ cdef void rank_sorted_1d(
 
 
 def rank_2d(
-    ndarray[rank_t, ndim=2] in_arr,
+    ndarray[numeric_object_t, ndim=2] in_arr,
     int axis=0,
     bint is_datetimelike=False,
     ties_method="average",
@@ -1316,13 +1314,13 @@ def rank_2d(
         Py_ssize_t k, n, col
         float64_t[::1, :] out  # Column-major so columns are contiguous
         int64_t[::1] grp_sizes
-        ndarray[rank_t, ndim=2] values
-        rank_t[:, :] masked_vals
+        ndarray[numeric_object_t, ndim=2] values
+        numeric_object_t[:, :] masked_vals
         intp_t[:, :] sort_indexer
         uint8_t[:, :] mask
         TiebreakEnumType tiebreak
         bint check_mask, keep_na, nans_rank_highest
-        rank_t nan_fill_val
+        numeric_object_t nan_fill_val
 
     tiebreak = tiebreakers[ties_method]
     if tiebreak == TIEBREAK_FIRST:
@@ -1332,24 +1330,25 @@ def rank_2d(
     keep_na = na_option == 'keep'
 
     # For cases where a mask is not possible, we can avoid mask checks
-    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
+    check_mask = not (numeric_object_t is uint64_t or
+                      (numeric_object_t is int64_t and not is_datetimelike))
 
     if axis == 1:
         values = np.asarray(in_arr).T.copy()
     else:
         values = np.asarray(in_arr).copy()
 
-    if rank_t is object:
+    if numeric_object_t is object:
         if values.dtype != np.object_:
             values = values.astype('O')
 
     nans_rank_highest = ascending ^ (na_option == 'top')
     if check_mask:
-        nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
+        nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest)
 
-        if rank_t is object:
+        if numeric_object_t is object:
             mask = missing.isnaobj2d(values).view(np.uint8)
-        elif rank_t is float64_t:
+        elif numeric_object_t is float64_t:
             mask = np.isnan(values).view(np.uint8)
 
         # int64 and datetimelike
diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd
new file mode 100644
index 0000000000000..ef95b8aab6e70
--- /dev/null
+++ b/pandas/_libs/dtypes.pxd
@@ -0,0 +1,17 @@
+"""
+Common location for shared fused types
+"""
+
+from numpy cimport (
+    float32_t,
+    float64_t,
+    int64_t,
+    uint64_t,
+)
+
+ctypedef fused numeric_object_t:
+    float64_t
+    float32_t
+    int64_t
+    uint64_t
+    object
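
A note on the new shared fused type (illustration, not part of the patch): `ctypedef fused` makes Cython emit one compiled specialization of each function per member type, and a particular specialization can be chosen by indexing the function with a type. That is how `get_rank_nan_fill_val[numeric_object_t](...)` is called above, and why that helper carries an otherwise unused fused-typed parameter. A self-contained sketch, using plain C types instead of the numpy-provided ones and a hypothetical file name:

```cython
# fused_demo.pyx -- illustrative sketch only; build with `cythonize -i fused_demo.pyx`.
# The real pandas/_libs/dtypes.pxd lists the numpy-provided float64_t, float32_t,
# int64_t, uint64_t and object rather than the plain C types used here.

ctypedef fused numeric_object_t:
    double
    long
    object


cdef numeric_object_t largest(numeric_object_t a, numeric_object_t b):
    # One C-level version of this function is generated per member type:
    # a typed comparison for the numeric specializations, a rich comparison
    # for the object specialization.
    return a if a >= b else b


def demo():
    # Indexing picks a specialization explicitly when the argument types
    # alone would be ambiguous (a Python int could match several members).
    print(largest[long](3, 5))          # 5
    print(largest[double](1.5, -2.0))   # 1.5
    print(largest[object]("a", "b"))    # b
```

Placing the fused type in a shared `.pxd` is what lets both `algos.pyx` and `groupby.pyx` `cimport numeric_object_t` instead of each keeping its own private `rank_t` definition.
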
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index b8700aa473d03..bbdc5a8287502 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -43,6 +43,7 @@ from pandas._libs.algos import (
     take_2d_axis1_float64_float64,
 )
 
+from pandas._libs.dtypes cimport numeric_object_t
 from pandas._libs.missing cimport checknull
 
 
@@ -921,23 +922,15 @@ def group_quantile(ndarray[float64_t, ndim=2] out,
 # group_nth, group_last, group_rank
 # ----------------------------------------------------------------------
 
-ctypedef fused rank_t:
-    float64_t
-    float32_t
-    int64_t
-    uint64_t
-    object
-
-
-cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
-    if rank_t is object:
+cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
+    if numeric_object_t is object:
         # Should never be used, but we need to avoid the `val != val` below
         # or else cython will raise about gil acquisition.
         raise NotImplementedError
 
-    elif rank_t is int64_t:
+    elif numeric_object_t is int64_t:
         return is_datetimelike and val == NPY_NAT
-    elif rank_t is uint64_t:
+    elif numeric_object_t is uint64_t:
         # There is no NA value for uint64
         return False
     else:
@@ -945,12 +938,12 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
 
 
 # GH#31710 use memoryviews once cython 0.30 is released so we can
-# use `const rank_t[:, :] values`
+# use `const numeric_object_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_last(rank_t[:, ::1] out,
+def group_last(numeric_object_t[:, ::1] out,
                int64_t[::1] counts,
-               ndarray[rank_t, ndim=2] values,
+               ndarray[numeric_object_t, ndim=2] values,
                const intp_t[::1] labels,
                Py_ssize_t min_count=-1) -> None:
     """
@@ -958,8 +951,8 @@ def group_last(rank_t[:, ::1] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
-        ndarray[rank_t, ndim=2] resx
+        numeric_object_t val
+        ndarray[numeric_object_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
 
@@ -970,14 +963,14 @@ def group_last(rank_t[:, ::1] out,
 
     min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    if rank_t is object:
+    if numeric_object_t is object:
         resx = np.empty((<object>out).shape, dtype=object)
     else:
         resx = np.empty_like(out)
 
     N, K = (<object>values).shape
 
-    if rank_t is object:
+    if numeric_object_t is object:
         # TODO: De-duplicate once conditional-nogil is available
         for i in range(N):
             lab = labels[i]
@@ -1019,9 +1012,9 @@ def group_last(rank_t[:, ::1] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] < min_count:
-                        if rank_t is int64_t:
+                        if numeric_object_t is int64_t:
                             out[i, j] = NPY_NAT
-                        elif rank_t is uint64_t:
+                        elif numeric_object_t is uint64_t:
                             runtime_error = True
                             break
                         else:
@@ -1037,12 +1030,12 @@
 
 
 # GH#31710 use memoryviews once cython 0.30 is released so we can
-# use `const rank_t[:, :] values`
+# use `const numeric_object_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_nth(rank_t[:, ::1] out,
+def group_nth(numeric_object_t[:, ::1] out,
               int64_t[::1] counts,
-              ndarray[rank_t, ndim=2] values,
+              ndarray[numeric_object_t, ndim=2] values,
              const intp_t[::1] labels,
              int64_t min_count=-1, int64_t rank=1,
              ) -> None:
    """
@@ -1052,8 +1045,8 @@ def group_nth(rank_t[:, ::1] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
-        ndarray[rank_t, ndim=2] resx
+        numeric_object_t val
+        ndarray[numeric_object_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
 
@@ -1064,14 +1057,14 @@ def group_nth(rank_t[:, ::1] out,
 
     min_count = max(min_count, 1)
    nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    if rank_t is object:
+    if numeric_object_t is object:
        resx = np.empty((<object>out).shape, dtype=object)
    else:
        resx = np.empty_like(out)
 
    N, K = (<object>values).shape
 
-    if rank_t is object:
+    if numeric_object_t is object:
        # TODO: De-duplicate once conditional-nogil is available
        for i in range(N):
            lab = labels[i]
@@ -1116,9 +1109,9 @@ def group_nth(rank_t[:, ::1] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] < min_count:
-                        if rank_t is int64_t:
+                        if numeric_object_t is int64_t:
                             out[i, j] = NPY_NAT
-                        elif rank_t is uint64_t:
+                        elif numeric_object_t is uint64_t:
                             runtime_error = True
                             break
                         else:
@@ -1135,7 +1128,7 @@
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_rank(float64_t[:, ::1] out,
-               ndarray[rank_t, ndim=2] values,
+               ndarray[numeric_object_t, ndim=2] values,
                const intp_t[::1] labels,
                int ngroups,
                bint is_datetimelike, str ties_method="average",
@@ -1147,7 +1140,7 @@ def group_rank(float64_t[:, ::1] out,
     ----------
     out : np.ndarray[np.float64, ndim=2]
         Values to which this method will write its results.
-    values : np.ndarray of rank_t values to be ranked
+    values : np.ndarray of numeric_object_t values to be ranked
     labels : np.ndarray[np.intp]
         Array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`