CLN: share rank_t fused type (#43789)

mzeitlin11 · web-flow · commit 56a5f5c4588b · 2021-09-29T14:03:01.000-04:00
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -45,6 +45,7 @@ from numpy cimport (
 cnp.import_array()
 
 cimport pandas._libs.util as util
+from pandas._libs.dtypes cimport numeric_object_t
 from pandas._libs.khash cimport (
     kh_destroy_int64,
     kh_get_int64,
@@ -860,34 +861,30 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
 # rank_1d, rank_2d
 # ----------------------------------------------------------------------
 
-ctypedef fused rank_t:
-    object
-    float64_t
-    uint64_t
-    int64_t
-
-
-cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
+cdef numeric_object_t get_rank_nan_fill_val(
+        bint rank_nans_highest,
+        numeric_object_t[:] _=None
+):
     """
     Return the value we'll use to represent missing values when sorting depending
     on if we'd like missing values to end up at the top/bottom. (The second parameter
     is unused, but needed for fused type specialization)
     """
     if rank_nans_highest:
-        if rank_t is object:
+        if numeric_object_t is object:
             return Infinity()
-        elif rank_t is int64_t:
+        elif numeric_object_t is int64_t:
             return util.INT64_MAX
-        elif rank_t is uint64_t:
+        elif numeric_object_t is uint64_t:
             return util.UINT64_MAX
         else:
             return np.inf
     else:
-        if rank_t is object:
+        if numeric_object_t is object:
             return NegInfinity()
-        elif rank_t is int64_t:
+        elif numeric_object_t is int64_t:
             return NPY_NAT
-        elif rank_t is uint64_t:
+        elif numeric_object_t is uint64_t:
             return 0
         else:
             return -np.inf
@@ -896,7 +893,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def rank_1d(
-    ndarray[rank_t, ndim=1] values,
+    ndarray[numeric_object_t, ndim=1] values,
     const intp_t[:] labels=None,
     bint is_datetimelike=False,
     ties_method="average",
@@ -909,7 +906,7 @@ def rank_1d(
 
     Parameters
     ----------
-    values : array of rank_t values to be ranked
+    values : array of numeric_object_t values to be ranked
     labels : np.ndarray[np.intp] or None
         Array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`. If not called
@@ -939,11 +936,11 @@ def rank_1d(
         int64_t[::1] grp_sizes
         intp_t[:] lexsort_indexer
         float64_t[::1] out
-        ndarray[rank_t, ndim=1] masked_vals
-        rank_t[:] masked_vals_memview
+        ndarray[numeric_object_t, ndim=1] masked_vals
+        numeric_object_t[:] masked_vals_memview
         uint8_t[:] mask
         bint keep_na, nans_rank_highest, check_labels, check_mask
-        rank_t nan_fill_val
+        numeric_object_t nan_fill_val
 
     tiebreak = tiebreakers[ties_method]
     if tiebreak == TIEBREAK_FIRST:
@@ -964,21 +961,22 @@ def rank_1d(
     check_labels = labels is not None
 
     # For cases where a mask is not possible, we can avoid mask checks
-    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
+    check_mask = not (numeric_object_t is uint64_t or
+                      (numeric_object_t is int64_t and not is_datetimelike))
 
     # Copy values into new array in order to fill missing data
     # with mask, without obfuscating location of missing data
     # in values array
-    if rank_t is object and values.dtype != np.object_:
+    if numeric_object_t is object and values.dtype != np.object_:
         masked_vals = values.astype('O')
     else:
         masked_vals = values.copy()
 
-    if rank_t is object:
+    if numeric_object_t is object:
         mask = missing.isnaobj(masked_vals)
-    elif rank_t is int64_t and is_datetimelike:
+    elif numeric_object_t is int64_t and is_datetimelike:
         mask = (masked_vals == NPY_NAT).astype(np.uint8)
-    elif rank_t is float64_t:
+    elif numeric_object_t is float64_t:
         mask = np.isnan(masked_vals).astype(np.uint8)
     else:
         mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)
@@ -990,7 +988,7 @@ def rank_1d(
     # will flip the ordering to still end up with lowest rank.
     # Symmetric logic applies to `na_option == 'bottom'`
     nans_rank_highest = ascending ^ (na_option == 'top')
-    nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
+    nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest)
     if nans_rank_highest:
         order = [masked_vals, mask]
     else:
@@ -1037,7 +1035,7 @@ cdef void rank_sorted_1d(
     int64_t[::1] grp_sizes,
     const intp_t[:] sort_indexer,
     # Can make const with cython3 (https://github.com/cython/cython/issues/3222)
-    rank_t[:] masked_vals,
+    numeric_object_t[:] masked_vals,
     const uint8_t[:] mask,
     bint check_mask,
     Py_ssize_t N,
@@ -1061,7 +1059,7 @@ cdef void rank_sorted_1d(
         if labels is None.
     sort_indexer : intp_t[:]
         Array of indices which sorts masked_vals
-    masked_vals : rank_t[:]
+    masked_vals : numeric_object_t[:]
         The values input to rank_1d, with missing values replaced by fill values
     mask : uint8_t[:]
         Array where entries are True if the value is missing, False otherwise.
@@ -1093,7 +1091,7 @@ cdef void rank_sorted_1d(
     # that sorted value for retrieval back from the original
     # values / masked_vals arrays
     # TODO: de-duplicate once cython supports conditional nogil
-    if rank_t is object:
+    if numeric_object_t is object:
         with gil:
             for i in range(N):
                 at_end = i == N - 1
@@ -1301,7 +1299,7 @@ cdef void rank_sorted_1d(
 
 
 def rank_2d(
-    ndarray[rank_t, ndim=2] in_arr,
+    ndarray[numeric_object_t, ndim=2] in_arr,
     int axis=0,
     bint is_datetimelike=False,
     ties_method="average",
@@ -1316,13 +1314,13 @@ def rank_2d(
         Py_ssize_t k, n, col
         float64_t[::1, :] out  # Column-major so columns are contiguous
         int64_t[::1] grp_sizes
-        ndarray[rank_t, ndim=2] values
-        rank_t[:, :] masked_vals
+        ndarray[numeric_object_t, ndim=2] values
+        numeric_object_t[:, :] masked_vals
         intp_t[:, :] sort_indexer
         uint8_t[:, :] mask
         TiebreakEnumType tiebreak
         bint check_mask, keep_na, nans_rank_highest
-        rank_t nan_fill_val
+        numeric_object_t nan_fill_val
 
     tiebreak = tiebreakers[ties_method]
     if tiebreak == TIEBREAK_FIRST:
@@ -1332,24 +1330,25 @@ def rank_2d(
     keep_na = na_option == 'keep'
 
     # For cases where a mask is not possible, we can avoid mask checks
-    check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
+    check_mask = not (numeric_object_t is uint64_t or
+                      (numeric_object_t is int64_t and not is_datetimelike))
 
     if axis == 1:
         values = np.asarray(in_arr).T.copy()
     else:
         values = np.asarray(in_arr).copy()
 
-    if rank_t is object:
+    if numeric_object_t is object:
         if values.dtype != np.object_:
             values = values.astype('O')
 
     nans_rank_highest = ascending ^ (na_option == 'top')
     if check_mask:
-        nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
+        nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest)
 
-        if rank_t is object:
+        if numeric_object_t is object:
             mask = missing.isnaobj2d(values).view(np.uint8)
-        elif rank_t is float64_t:
+        elif numeric_object_t is float64_t:
             mask = np.isnan(values).view(np.uint8)
 
         # int64 and datetimelike
diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd
@@ -0,0 +1,17 @@
+"""
+Common location for shared fused types
+"""
+
+from numpy cimport (
+    float32_t,
+    float64_t,
+    int64_t,
+    uint64_t,
+)
+
+ctypedef fused numeric_object_t:
+    float64_t
+    float32_t
+    int64_t
+    uint64_t
+    object
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -43,6 +43,7 @@ from pandas._libs.algos import (
     take_2d_axis1_float64_float64,
 )
 
+from pandas._libs.dtypes cimport numeric_object_t
 from pandas._libs.missing cimport checknull
 
 
@@ -921,45 +922,37 @@ def group_quantile(ndarray[float64_t, ndim=2] out,
 # group_nth, group_last, group_rank
 # ----------------------------------------------------------------------
 
-ctypedef fused rank_t:
-    float64_t
-    float32_t
-    int64_t
-    uint64_t
-    object
-
-
-cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
-    if rank_t is object:
+cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
+    if numeric_object_t is object:
         # Should never be used, but we need to avoid the `val != val` below
         #  or else cython will raise about gil acquisition.
         raise NotImplementedError
 
-    elif rank_t is int64_t:
+    elif numeric_object_t is int64_t:
         return is_datetimelike and val == NPY_NAT
-    elif rank_t is uint64_t:
+    elif numeric_object_t is uint64_t:
         # There is no NA value for uint64
         return False
     else:
         return val != val
 
 
 # GH#31710 use memorviews once cython 0.30 is released so we can
-#  use `const rank_t[:, :] values`
+#  use `const numeric_object_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_last(rank_t[:, ::1] out,
+def group_last(numeric_object_t[:, ::1] out,
                int64_t[::1] counts,
-               ndarray[rank_t, ndim=2] values,
+               ndarray[numeric_object_t, ndim=2] values,
                const intp_t[::1] labels,
                Py_ssize_t min_count=-1) -> None:
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
-        ndarray[rank_t, ndim=2] resx
+        numeric_object_t val
+        ndarray[numeric_object_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
 
@@ -970,14 +963,14 @@ def group_last(rank_t[:, ::1] out,
 
     min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    if rank_t is object:
+    if numeric_object_t is object:
         resx = np.empty((<object>out).shape, dtype=object)
     else:
         resx = np.empty_like(out)
 
     N, K = (<object>values).shape
 
-    if rank_t is object:
+    if numeric_object_t is object:
         # TODO: De-duplicate once conditional-nogil is available
         for i in range(N):
             lab = labels[i]
@@ -1019,9 +1012,9 @@ def group_last(rank_t[:, ::1] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] < min_count:
-                        if rank_t is int64_t:
+                        if numeric_object_t is int64_t:
                             out[i, j] = NPY_NAT
-                        elif rank_t is uint64_t:
+                        elif numeric_object_t is uint64_t:
                             runtime_error = True
                             break
                         else:
@@ -1037,12 +1030,12 @@ def group_last(rank_t[:, ::1] out,
 
 
 # GH#31710 use memorviews once cython 0.30 is released so we can
-#  use `const rank_t[:, :] values`
+#  use `const numeric_object_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_nth(rank_t[:, ::1] out,
+def group_nth(numeric_object_t[:, ::1] out,
               int64_t[::1] counts,
-              ndarray[rank_t, ndim=2] values,
+              ndarray[numeric_object_t, ndim=2] values,
               const intp_t[::1] labels,
               int64_t min_count=-1,
               int64_t rank=1,
@@ -1052,8 +1045,8 @@ def group_nth(rank_t[:, ::1] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        rank_t val
-        ndarray[rank_t, ndim=2] resx
+        numeric_object_t val
+        ndarray[numeric_object_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
 
@@ -1064,14 +1057,14 @@ def group_nth(rank_t[:, ::1] out,
 
     min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    if rank_t is object:
+    if numeric_object_t is object:
         resx = np.empty((<object>out).shape, dtype=object)
     else:
         resx = np.empty_like(out)
 
     N, K = (<object>values).shape
 
-    if rank_t is object:
+    if numeric_object_t is object:
         # TODO: De-duplicate once conditional-nogil is available
         for i in range(N):
             lab = labels[i]
@@ -1116,9 +1109,9 @@ def group_nth(rank_t[:, ::1] out,
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] < min_count:
-                        if rank_t is int64_t:
+                        if numeric_object_t is int64_t:
                             out[i, j] = NPY_NAT
-                        elif rank_t is uint64_t:
+                        elif numeric_object_t is uint64_t:
                             runtime_error = True
                             break
                         else:
@@ -1135,7 +1128,7 @@ def group_nth(rank_t[:, ::1] out,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_rank(float64_t[:, ::1] out,
-               ndarray[rank_t, ndim=2] values,
+               ndarray[numeric_object_t, ndim=2] values,
                const intp_t[::1] labels,
                int ngroups,
                bint is_datetimelike, str ties_method="average",
@@ -1147,7 +1140,7 @@ def group_rank(float64_t[:, ::1] out,
     ----------
     out : np.ndarray[np.float64, ndim=2]
         Values to which this method will write its results.
-    values : np.ndarray of rank_t values to be ranked
+    values : np.ndarray of numeric_object_t values to be ranked
     labels : np.ndarray[np.intp]
         Array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`