From c7a91ac3fb89638285e2b524c849197c37375f6b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 16:00:40 -0400 Subject: [PATCH 01/25] REF: split out sorted_rank algo --- pandas/_libs/algos.pyx | 335 +++++++++++++++++++++++++---------------- 1 file changed, 203 insertions(+), 132 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index f2efeedb80d4d..362df3bf7e710 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -950,16 +950,15 @@ def rank_1d( """ cdef: TiebreakEnumType tiebreak - Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0 - ndarray[int64_t, ndim=1] grp_sizes - ndarray[intp_t, ndim=1] lexsort_indexer - ndarray[float64_t, ndim=1] out - ndarray[rank_t, ndim=1] masked_vals - ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels, group_changed + Py_ssize_t N + int64_t[::1] grp_sizes + intp_t[:] lexsort_indexer + float64_t[::1] out + ndarray [rank_t, ndim=1] masked_vals + rank_t[:] masked_vals_memview + uint8_t[:] mask + bint keep_na, check_labels rank_t nan_fill_val - int64_t grp_size tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -978,6 +977,9 @@ def rank_1d( # comparisons check_labels = np.any(labels) + # For cases where a mask is not possible, we can avoid mask checks + check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -1021,9 +1023,11 @@ def rank_1d( else: nan_fill_val = -np.inf - order = (masked_vals, ~mask, labels) + order = (masked_vals, ~(np.array(mask, copy=False)), labels) np.putmask(masked_vals, mask, nan_fill_val) + # putmask doesn't accept a memoryview, so we assign as a separate step + masked_vals_memview = masked_vals # lexsort using labels, then mask, then actual values # each label corresponds to a different group value, @@ -1034,6 +1038,77 @@ def rank_1d( if not ascending: lexsort_indexer = lexsort_indexer[::-1] + with nogil: + rank_sorted_1d( + out, + grp_sizes, + labels, + lexsort_indexer, + masked_vals_memview, + mask, + tiebreak, + check_mask, + check_labels, + keep_na, + N, + ) + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + + return np.array(out) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void rank_sorted_1d( + float64_t[::1] out, + int64_t[::1] grp_sizes, + const intp_t[:] labels, + const intp_t[:] sort_indexer, + # Can make const with cython3 (https://github.com/cython/cython/issues/3222) + rank_t[:] masked_vals, + const uint8_t[:] mask, + TiebreakEnumType tiebreak, + bint check_mask, + bint check_labels, + bint keep_na, + Py_ssize_t N, +) nogil: + """ + See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should + be handled in the caller. Note that `out` and `grp_sizes` are modified inplace. + + Parameters + ---------- + out : float64_t[::1] + Array to store computed ranks + grp_sizes : int64_t[::1] + Array to store group counts. 
+ labels : See rank_1d.__doc__ + sort_indexer : intp_t[:] + masked_vals : rank_t[:] + mask : uint8_t[:] + tiebreak : TiebreakEnumType + See rank_1d.__doc__ for the different modes + check_mask : bint + If False, assumes the mask is all False to skip mask indexing + check_labels : bint + If False, assumes all labels are the same to skip group handling logic + keep_na : bint + Whether or not to keep nulls + N : Py_ssize_t + The number of elements to rank. Note: it is not always true that + N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) + """ + + cdef: + Py_ssize_t i, j, dups=0, sum_ranks=0, + Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 + bint at_end, next_val_diff, group_changed + int64_t grp_size + # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer # array that we sorted previously, which gives us the location of @@ -1041,105 +1116,7 @@ def rank_1d( # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil if rank_t is object: - for i in range(N): - at_end = i == N - 1 - - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change. Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) - - # We'll need this check later anyway to determine group size, so just - # compute it here since shortcircuiting won't help - group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - - # Update out only when there is a transition of values or labels. - # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if (next_val_diff or group_changed - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - - # If keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: - grp_na_count = dups - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 - - # With n as the previous rank in the group and m as the number - # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, - # then rankings should be n + 1, n + 2 ... n + m - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = j + 1 - grp_start - - # If TIEBREAK_FIRST and descending, the ranking should be - # n + m, n + (m - 1) ... n + 1. This is equivalent to - # (i - dups + 1) + (i - j + 1) - grp_start - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen - - # Look forward to the next value (using the sorting in - # lexsort_indexer). 
If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. The conditional also needs - # to handle nan equality and the end of iteration. If group - # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or - (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. Fill in the size of each - # group encountered (used by pct calculations later). Also be - # sure to reset any of the items helping to calculate dups - if group_changed: - - # If not dense tiebreak, group size used to compute - # percentile will be # of non-null elements in group - if tiebreak != TIEBREAK_DENSE: - grp_size = i - grp_start + 1 - grp_na_count - - # Otherwise, it will be the number of distinct values - # in the group, subtracting 1 if NaNs are present - # since that is a distinct value we shouldn't count - else: - grp_size = grp_vals_seen - (grp_na_count > 0) - - for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = grp_size - - dups = sum_ranks = 0 - grp_na_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - else: - with nogil: + with gil: for i in range(N): at_end = i == N - 1 @@ -1149,55 +1126,56 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], + masked_vals[sort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if (next_val_diff or group_changed - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + if (next_val_diff or group_changed or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): # If keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: + if keep_na and check_mask and mask[sort_indexer[i]]: grp_na_count = dups for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN + out[sort_indexer[j]] = NaN elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups + out[sort_indexer[j]] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 + out[sort_indexer[j]] = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 + out[sort_indexer[j]] = i - grp_start + 1 # With n as the previous rank in the group and m as the number # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, # then rankings should be n + 1, n + 2 ... 
n + m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = j + 1 - grp_start + out[sort_indexer[j]] = j + 1 - grp_start # If TIEBREAK_FIRST and descending, the ranking should be # n + m, n + (m - 1) ... n + 1. This is equivalent to # (i - dups + 1) + (i - j + 1) - grp_start elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen + out[sort_indexer[j]] = grp_vals_seen # Look forward to the next value (using the sorting in # lexsort_indexer). If the value does not equal the current @@ -1205,9 +1183,9 @@ def rank_1d( # that a new value is coming up. The conditional also needs # to handle nan equality and the end of iteration. If group # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or - (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]])): + if not group_changed and (next_val_diff or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1231,19 +1209,112 @@ def rank_1d( grp_size = grp_vals_seen - (grp_na_count > 0) for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = grp_size + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 grp_start = i + 1 grp_vals_seen = 1 - - if pct: + else: for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] + at_end = i == N - 1 + + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change. Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + next_val_diff = at_end or (masked_vals[sort_indexer[i]] + != masked_vals[sort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) + + # Update out only when there is a transition of values or labels. + # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if (next_val_diff or group_changed + or (check_mask and + (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))): + + # If keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and check_mask and mask[sort_indexer[i]]: + grp_na_count = dups + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... 
n + m + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = grp_vals_seen + + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration. If group + # changes we do not record seeing a new value in the group + if not group_changed and (next_val_diff + or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): + dups = sum_ranks = 0 + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). Also be + # sure to reset any of the items helping to calculate dups + if group_changed: - return out + # If not dense tiebreak, group size used to compute + # percentile will be # of non-null elements in group + if tiebreak != TIEBREAK_DENSE: + grp_size = i - grp_start + 1 - grp_na_count + + # Otherwise, it will be the number of distinct values + # in the group, subtracting 1 if NaNs are present + # since that is a distinct value we shouldn't count + else: + grp_size = grp_vals_seen - (grp_na_count > 0) + + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size + + dups = sum_ranks = 0 + grp_na_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 def rank_2d( From 4b0641ecfb96f49a80a221915e0a013b0d783b08 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 16:26:41 -0400 Subject: [PATCH 02/25] Fixup docstring --- pandas/_libs/algos.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 362df3bf7e710..9ab12607e7789 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -954,10 +954,10 @@ def rank_1d( int64_t[::1] grp_sizes intp_t[:] lexsort_indexer float64_t[::1] out - ndarray [rank_t, ndim=1] masked_vals + ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview uint8_t[:] mask - bint keep_na, check_labels + bint keep_na, check_labels, check_mask rank_t nan_fill_val tiebreak = tiebreakers[ties_method] @@ -1088,8 +1088,11 @@ cdef void rank_sorted_1d( Array to store group counts. 
labels : See rank_1d.__doc__ sort_indexer : intp_t[:] + Array of indices which sorts masked_vals masked_vals : rank_t[:] + The values input to rank_1d, with missing values replaced by fill values mask : uint8_t[:] + Array where entries are True if the value is missing, False otherwise tiebreak : TiebreakEnumType See rank_1d.__doc__ for the different modes check_mask : bint From b6dd4a6010d75af32725392c1e49e7bea352c0b4 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 18:31:35 -0400 Subject: [PATCH 03/25] WIP --- pandas/_libs/algos.pyx | 172 +++++++++++--------------- pandas/_libs/algos_take_helper.pxi.in | 30 ----- 2 files changed, 73 insertions(+), 129 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 9ab12607e7789..fcdc0d1ccaf27 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1038,30 +1038,30 @@ def rank_1d( if not ascending: lexsort_indexer = lexsort_indexer[::-1] - with nogil: - rank_sorted_1d( - out, - grp_sizes, - labels, - lexsort_indexer, - masked_vals_memview, - mask, - tiebreak, - check_mask, - check_labels, - keep_na, - N, - ) - if pct: - for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] + # with nogil: + rank_sorted_1d( + out, + grp_sizes, + labels, + lexsort_indexer, + masked_vals_memview, + mask, + tiebreak, + check_mask, + check_labels, + keep_na, + N, + ) + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] return np.array(out) -@cython.wraparound(False) -@cython.boundscheck(False) +# @cython.wraparound(False) +# @cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, @@ -1075,7 +1075,7 @@ cdef void rank_sorted_1d( bint check_labels, bint keep_na, Py_ssize_t N, -) nogil: +): """ See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should be handled in the caller. Note that `out` and `grp_sizes` are modified inplace. @@ -1119,7 +1119,7 @@ cdef void rank_sorted_1d( # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil if rank_t is object: - with gil: + # with gil: for i in range(N): at_end = i == N - 1 @@ -1220,6 +1220,7 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 else: for i in range(N): + print(i) at_end = i == N - 1 # dups and sum_ranks will be incremented each loop where @@ -1227,15 +1228,18 @@ cdef void rank_sorted_1d( # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 - + print(sort_indexer[i]) + print(sort_indexer[i+1]) next_val_diff = at_end or (masked_vals[sort_indexer[i]] != masked_vals[sort_indexer[i+1]]) + print("here") # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and (labels[sort_indexer[i]] != labels[sort_indexer[i+1]])) + print("here") # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( @@ -1333,17 +1337,16 @@ def rank_2d( Fast NaN-friendly version of ``scipy.stats.rankdata``. 
""" cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - Py_ssize_t infs - ndarray[float64_t, ndim=2] ranks + Py_ssize_t k, n, col + float64_t[::1, :] out # Column-major so columns are contiguous + int64_t[::1, :] grp_sizes + const intp_t[:] labels ndarray[rank_t, ndim=2] values - ndarray[intp_t, ndim=2] argsort_indexer - ndarray[uint8_t, ndim=2] mask - rank_t val, nan_value - float64_t count, sum_ranks = 0.0 - int tiebreak = 0 - int64_t idx - bint check_mask, condition, keep_na + rank_t[:, :] masked_vals_memview + intp_t[:, :] argsort_indexer + uint8_t[:, :] mask + TiebreakEnumType tiebreak + bint check_mask, keep_na tiebreak = tiebreakers[ties_method] @@ -1396,85 +1399,56 @@ def rank_2d( mask = np.zeros_like(values, dtype=bool) n, k = (values).shape - ranks = np.empty((n, k), dtype='f8') + out = np.empty((n, k), dtype='f8', order='F') + grp_sizes = np.ones((n, k), dtype='i8', order='F') + labels = np.ones(n, dtype=np.intp) if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here - argsort_indexer = values.argsort(axis=1, kind='mergesort') + argsort_indexer = values.argsort(axis=1, kind='mergesort').astype( + np.intp, copy=False + ) if not ascending: tiebreak = TIEBREAK_FIRST_DESCENDING else: - argsort_indexer = values.argsort(1) + argsort_indexer = values.argsort(1).astype(np.intp, copy=False) if not ascending: argsort_indexer = argsort_indexer[:, ::-1] - values = _take_2d(values, argsort_indexer) - - for i in range(n): - dups = sum_ranks = infs = 0 - - total_tie_count = 0 - count = 0.0 - for j in range(k): - val = values[i, j] - idx = argsort_indexer[i, j] - if keep_na and check_mask and mask[i, idx]: - ranks[i, idx] = NaN - infs += 1 - continue - - count += 1.0 - - sum_ranks += (j - infs) + 1 - dups += 1 + masked_vals_memview = values + print(np.array(argsort_indexer)) + + print(k) + print(n) + print(values) + for col in range(k): + print("col" + str(col)) + # print(np.array(masked_vals_memview[:, col])) + + print(np.array(argsort_indexer[:, col])) + print(np.array(masked_vals_memview[:, col])) + # print(np.array(mask[:, col])) + # print(np.array(grp_sizes[:, col])) + # print(np.array(out[:, col])) + rank_sorted_1d( + out[:, col], + grp_sizes[:, col], + labels, + argsort_indexer[:, col], + masked_vals_memview[:, col], + mask[:, col], + tiebreak, + check_mask, + False, + keep_na, + n, + ) - if rank_t is object: - condition = ( - j == k - 1 or - are_diff(values[i, j + 1], val) or - (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) - ) - else: - condition = ( - j == k - 1 or - values[i, j + 1] != val or - (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) - ) - - if condition: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for non-numeric data') - else: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = total_tie_count - sum_ranks = dups 
= 0 - if pct: - if tiebreak == TIEBREAK_DENSE: - ranks[i, :] /= total_tie_count - else: - ranks[i, :] /= count if axis == 0: - return ranks.T + return np.array(out.T) else: - return ranks + return np.array(out) ctypedef fused diff_t: diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 11679fc432edc..1ad54216a1532 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -244,33 +244,3 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} {{endfor}} - -# ---------------------------------------------------------------------- -# take_2d internal function -# ---------------------------------------------------------------------- - -ctypedef fused take_t: - float64_t - uint64_t - int64_t - object - - -cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[intp_t, ndim=2, cast=True] indexer = idx - ndarray[take_t, ndim=2] result - - N, K = (values).shape - - if take_t is object: - # evaluated at compile-time - result = values.copy() - else: - result = np.empty_like(values) - - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result From 953b188cb429ea89c43442821e84b1a3d443a158 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 20:36:07 -0400 Subject: [PATCH 04/25] WIP --- pandas/_libs/algos.pyx | 103 ++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index fcdc0d1ccaf27..b886721ceba12 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1050,12 +1050,9 @@ def rank_1d( check_mask, check_labels, keep_na, + pct, N, ) - if pct: - for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] return np.array(out) @@ -1074,6 +1071,7 @@ cdef void rank_sorted_1d( bint check_mask, bint check_labels, bint keep_na, + bint pct, Py_ssize_t N, ): """ @@ -1220,7 +1218,6 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 else: for i in range(N): - print(i) at_end = i == N - 1 # dups and sum_ranks will be incremented each loop where @@ -1228,18 +1225,14 @@ cdef void rank_sorted_1d( # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 - print(sort_indexer[i]) - print(sort_indexer[i+1]) next_val_diff = at_end or (masked_vals[sort_indexer[i]] != masked_vals[sort_indexer[i+1]]) - print("here") # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and (labels[sort_indexer[i]] != labels[sort_indexer[i+1]])) - print("here") # Update out only when there is a transition of values or labels. 
# When a new value or group is encountered, go back #dups steps( @@ -1323,6 +1316,11 @@ cdef void rank_sorted_1d( grp_start = i + 1 grp_vals_seen = 1 + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + def rank_2d( ndarray[rank_t, ndim=2] in_arr, @@ -1355,7 +1353,7 @@ def rank_2d( # For cases where a mask is not possible, we can avoid mask checks check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) - if axis == 0: + if axis == 1: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() @@ -1364,73 +1362,51 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') - if check_mask: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - - # int64 and datetimelike - else: - nan_value = np.iinfo(np.int64).max + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is int64_t and is_datetimelike: + mask = (values == NPY_NAT).astype(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).astype(np.uint8) + else: + mask = np.zeros_like(values, dtype=np.uint8) + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_fill_val = Infinity() + elif rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = np.iinfo(np.uint64).max else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - - # int64 and datetimelike - else: - nan_value = NPY_NAT + nan_fill_val = np.inf + order = (values, mask) + else: if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is float64_t: - mask = np.isnan(values) - - # int64 and datetimelike + nan_fill_val = NegInfinity() + elif rank_t is int64_t: + nan_fill_val = NPY_NAT + elif rank_t is uint64_t: + nan_fill_val = 0 else: - mask = values == NPY_NAT + nan_fill_val = -np.inf - np.putmask(values, mask, nan_value) - else: - mask = np.zeros_like(values, dtype=bool) + order = (values, ~np.array(mask)) + + np.putmask(values, mask, nan_fill_val) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') grp_sizes = np.ones((n, k), dtype='i8', order='F') - labels = np.ones(n, dtype=np.intp) - - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - argsort_indexer = values.argsort(axis=1, kind='mergesort').astype( - np.intp, copy=False - ) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - argsort_indexer = values.argsort(1).astype(np.intp, copy=False) + labels = np.zeros(n, dtype=np.intp) + argsort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) if not ascending: - argsort_indexer = argsort_indexer[:, ::-1] + argsort_indexer = argsort_indexer[::-1, :] masked_vals_memview = values - print(np.array(argsort_indexer)) - - print(k) - print(n) - print(values) for col in range(k): - print("col" + str(col)) - # print(np.array(masked_vals_memview[:, col])) - - print(np.array(argsort_indexer[:, col])) - print(np.array(masked_vals_memview[:, col])) - # print(np.array(mask[:, col])) - # print(np.array(grp_sizes[:, col])) - # print(np.array(out[:, col])) rank_sorted_1d( out[:, col], grp_sizes[:, col], @@ -1442,10 +1418,11 @@ def rank_2d( check_mask, False, keep_na, + pct, n, ) - if axis == 0: + if axis == 1: return np.array(out.T) else: return np.array(out) From 254b9974025b7f8ff5851911c3cb078d953995d2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 20:38:30 -0400 Subject: [PATCH 05/25] 
premerge --- pandas/_libs/algos.pyx | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b886721ceba12..61b5a6140e7d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1038,27 +1038,27 @@ def rank_1d( if not ascending: lexsort_indexer = lexsort_indexer[::-1] - # with nogil: - rank_sorted_1d( - out, - grp_sizes, - labels, - lexsort_indexer, - masked_vals_memview, - mask, - tiebreak, - check_mask, - check_labels, - keep_na, - pct, - N, - ) + with nogil: + rank_sorted_1d( + out, + grp_sizes, + labels, + lexsort_indexer, + masked_vals_memview, + mask, + tiebreak, + check_mask, + check_labels, + keep_na, + pct, + N, + ) return np.array(out) -# @cython.wraparound(False) -# @cython.boundscheck(False) +@cython.wraparound(False) +@cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, @@ -1117,7 +1117,7 @@ cdef void rank_sorted_1d( # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil if rank_t is object: - # with gil: + with gil: for i in range(N): at_end = i == N - 1 From 29dc59090e865ad69d7c5c1e647895916b315bdb Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 21:25:24 -0400 Subject: [PATCH 06/25] REF: give ranks same nan filling --- pandas/_libs/algos.pyx | 100 +++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 03f4ce273de6e..4fd515113316c 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -931,6 +931,32 @@ ctypedef fused rank_t: int64_t +cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, ndarray[rank_t, ndim=1] _): + """ + Return the value we'll use to represent missing values when sorting depending + on if we'd like missing values to end up at the top/bottom. (The second parameter + is unused, but needed for fused type specialization) + """ + if rank_nans_highest: + if rank_t is object: + return Infinity() + elif rank_t is int64_t: + return np.iinfo(np.int64).max + elif rank_t is uint64_t: + return np.iinfo(np.uint64).max + else: + return np.inf + else: + if rank_t is object: + return NegInfinity() + elif rank_t is int64_t: + return NPY_NAT + elif rank_t is uint64_t: + return 0 + else: + return -np.inf + + @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( @@ -980,7 +1006,7 @@ def rank_1d( ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview uint8_t[:] mask - bint keep_na, check_labels, check_mask + bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val tiebreak = tiebreakers[ties_method] @@ -1026,26 +1052,11 @@ def rank_1d( # If descending, fill with highest value since descending # will flip the ordering to still end up with lowest rank. 
# Symmetric logic applies to `na_option == 'bottom'` - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_fill_val = Infinity() - elif rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max - else: - nan_fill_val = np.inf + nans_rank_highest = ascending ^ (na_option == 'top') + nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, masked_vals) + if nans_rank_highest: order = (masked_vals, mask, labels) else: - if rank_t is object: - nan_fill_val = NegInfinity() - elif rank_t is int64_t: - nan_fill_val = NPY_NAT - elif rank_t is uint64_t: - nan_fill_val = 0 - else: - nan_fill_val = -np.inf - order = (masked_vals, ~(np.array(mask, copy=False)), labels) np.putmask(masked_vals, mask, nan_fill_val) @@ -1073,12 +1084,9 @@ def rank_1d( check_mask, check_labels, keep_na, + pct, N, ) - if pct: - for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] return np.array(out) @@ -1097,6 +1105,7 @@ cdef void rank_sorted_1d( bint check_mask, bint check_labels, bint keep_na, + bint pct, Py_ssize_t N, ) nogil: """ @@ -1108,7 +1117,7 @@ cdef void rank_sorted_1d( out : float64_t[::1] Array to store computed ranks grp_sizes : int64_t[::1] - Array to store group counts. + Array to store group counts, only used if pct=True labels : See rank_1d.__doc__ sort_indexer : intp_t[:] Array of indices which sorts masked_vals @@ -1118,12 +1127,14 @@ cdef void rank_sorted_1d( Array where entries are True if the value is missing, False otherwise tiebreak : TiebreakEnumType See rank_1d.__doc__ for the different modes - check_mask : bint + check_mask : bool If False, assumes the mask is all False to skip mask indexing - check_labels : bint + check_labels : bool If False, assumes all labels are the same to skip group handling logic - keep_na : bint + keep_na : bool Whether or not to keep nulls + pct : bool + Compute percentage rank of data within each group N : Py_ssize_t The number of elements to rank. 
Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) @@ -1342,6 +1353,11 @@ cdef void rank_sorted_1d( grp_start = i + 1 grp_vals_seen = 1 + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + def rank_2d( ndarray[rank_t, ndim=2] in_arr, @@ -1360,13 +1376,14 @@ def rank_2d( Py_ssize_t infs ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values + ndarray[rank_t, ndim=1] unused ndarray[intp_t, ndim=2] argsort_indexer ndarray[uint8_t, ndim=2] mask - rank_t val, nan_value + rank_t val, nan_fill_val float64_t count, sum_ranks = 0.0 int tiebreak = 0 int64_t idx - bint check_mask, condition, keep_na + bint check_mask, condition, keep_na, nans_rank_highest tiebreak = tiebreakers[ties_method] @@ -1384,26 +1401,11 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') + nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - - # int64 and datetimelike - else: - nan_value = np.iinfo(np.int64).max - - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - - # int64 and datetimelike - else: - nan_value = NPY_NAT + # For fused type specialization + unused = values[:, 0] + nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) if rank_t is object: mask = missing.isnaobj2d(values) @@ -1414,7 +1416,7 @@ def rank_2d( else: mask = values == NPY_NAT - np.putmask(values, mask, nan_value) + np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=bool) From 974650d76e1eca3d8dcb1a5e646e0b615dfdc6dc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 21:46:07 -0400 Subject: [PATCH 07/25] WIP --- pandas/_libs/algos.pyx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index bf5667b93f455..86690b44b1133 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1382,6 +1382,7 @@ def rank_2d( uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest + rank_t nan_fill_val tiebreak = tiebreakers[ties_method] @@ -1404,6 +1405,15 @@ def rank_2d( # For fused type specialization unused = values[:, 0] nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) + + if rank_t is object: + mask = missing.isnaobj2d(values).astype(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).astype(np.uint8) + + # int64 and datetimelike + else: + mask = (values == NPY_NAT).astype(np.uint8) np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=np.uint8) From b840b74040de4390ccd86cdee7e536b297f6a590 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 21:58:55 -0400 Subject: [PATCH 08/25] Handle empty case early --- pandas/_libs/algos.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4fd515113316c..c55d1e9898b79 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1385,6 +1385,9 @@ def rank_2d( int64_t idx bint check_mask, condition, keep_na, nans_rank_highest + if in_arr.shape[0] == 0 or in_arr.shape[1] == 0: + return np.empty_like(in_arr, dtype="f8") + tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' From f099bb0d0fd18440f1210adf13762f14aa3259e2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 22:02:06 
-0400 Subject: [PATCH 09/25] Handle empty case early --- pandas/_libs/algos.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 86690b44b1133..a3c1629209b9f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1384,6 +1384,9 @@ def rank_2d( bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val + if in_arr.shape[0] == 0 or in_arr.shape[1] == 0: + return np.empty_like(in_arr, dtype="f8") + tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' From 4aa4f8bbce26b4706381eeeb64851462d3718067 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 23:15:17 -0400 Subject: [PATCH 10/25] WIP --- pandas/_libs/algos.pyx | 36 ++++++++++++++++--------- pandas/tests/frame/methods/test_rank.py | 30 ++++++++++++--------- 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a3c1629209b9f..3a33206384fa8 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1377,7 +1377,7 @@ def rank_2d( const intp_t[:] labels ndarray[rank_t, ndim=2] values ndarray[rank_t, ndim=1] unused - rank_t[:, :] masked_vals_memview + rank_t[:, :] masked_vals intp_t[:, :] argsort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak @@ -1388,6 +1388,9 @@ def rank_2d( return np.empty_like(in_arr, dtype="f8") tiebreak = tiebreakers[ties_method] + if tiebreak == TIEBREAK_FIRST: + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING keep_na = na_option == 'keep' @@ -1403,20 +1406,29 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is int64_t and is_datetimelike: + mask = (values == NPY_NAT).astype(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).astype(np.uint8) + else: + mask = np.zeros_like(values, dtype=np.uint8) + nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: # For fused type specialization unused = values[:, 0] nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) - if rank_t is object: - mask = missing.isnaobj2d(values).astype(np.uint8) - elif rank_t is float64_t: - mask = np.isnan(values).astype(np.uint8) - - # int64 and datetimelike - else: - mask = (values == NPY_NAT).astype(np.uint8) + # if rank_t is object: + # mask = missing.isnaobj2d(values).view(np.uint8) + # elif rank_t is float64_t: + # mask = np.isnan(values).view(np.uint8) + # + # # int64 and datetimelike + # else: + # mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=np.uint8) @@ -1424,7 +1436,7 @@ def rank_2d( if nans_rank_highest: order = (values, mask) else: - order = (values, ~np.array(mask)) + order = (values, ~np.array(mask, copy=False)) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') @@ -1436,14 +1448,14 @@ def rank_2d( argsort_indexer = argsort_indexer[::-1, :] # putmask doesn't accept a memoryview, so we assign as a separate step - masked_vals_memview = values + masked_vals = values for col in range(k): rank_sorted_1d( out[:, col], grp_sizes[:, col], labels, argsort_indexer[:, col], - masked_vals_memview[:, col], + masked_vals[:, col], mask[:, col], tiebreak, check_mask, diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 5ba4ab4408f11..36dd6226866ca 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -246,13 +246,12 @@ def 
test_rank_methods_frame(self): expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) + @pytest.mark.parametrize("rank_method", ["average", "min", "max", "dense"]) @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") - def test_rank_descending(self, method, dtype): - + def test_rank_descending(self, rank_method, dtype): if "i" in dtype: - df = self.df.dropna() + df = self.df.dropna().astype(dtype) else: df = self.df.astype(dtype) @@ -260,18 +259,26 @@ def test_rank_descending(self, method, dtype): expected = (df.max() - df).rank() tm.assert_frame_equal(res, expected) - if method == "first" and dtype == "O": - return - - expected = (df.max() - df).rank(method=method) + expected = (df.max() - df).rank(method=rank_method) if dtype != "O": - res2 = df.rank(method=method, ascending=False, numeric_only=True) + res2 = df.rank(method=rank_method, ascending=False, numeric_only=True) tm.assert_frame_equal(res2, expected) - res3 = df.rank(method=method, ascending=False, numeric_only=False) + res3 = df.rank(method=rank_method, ascending=False, numeric_only=False) tm.assert_frame_equal(res3, expected) + @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) + @pytest.mark.parametrize("ascending", [True, False]) + def test_rank_first_ties(self, dtype, ascending, frame_or_series): + obj = frame_or_series([1, 1], dtype=dtype) + result = obj.rank(method="first", ascending=ascending) + expected_data = [1, 2] + if ascending: + expected_data = expected_data[::-1] + expected = frame_or_series(expected_data, dtype=np.float64) + tm.assert_equal(result, expected) + @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("dtype", [None, object]) def test_rank_2d_tie_methods(self, method, axis, dtype): @@ -287,9 +294,6 @@ def _check2d(df, expected, method="average", axis=0): result = df.rank(method=method, axis=axis) tm.assert_frame_equal(result, exp_df) - disabled = {(object, "first")} - if (dtype, method) in disabled: - return frame = df if dtype is None else df.astype(dtype) _check2d(frame, self.results[method], method=method, axis=axis) From c5ed688143a8ff9b278e473bad46b2140fb812bf Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 23:34:49 -0400 Subject: [PATCH 11/25] WIP --- pandas/tests/frame/methods/test_rank.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 36dd6226866ca..46988350c2367 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -247,9 +247,8 @@ def test_rank_methods_frame(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) - @pytest.mark.parametrize("rank_method", ["average", "min", "max", "dense"]) @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") - def test_rank_descending(self, rank_method, dtype): + def test_rank_descending(self, method, dtype): if "i" in dtype: df = self.df.dropna().astype(dtype) else: @@ -259,26 +258,15 @@ def test_rank_descending(self, rank_method, dtype): expected = (df.max() - df).rank() tm.assert_frame_equal(res, expected) - expected = (df.max() - df).rank(method=rank_method) + expected = (df.max() - df).rank(method=method) if dtype != "O": - res2 = df.rank(method=rank_method, ascending=False, numeric_only=True) + res2 = 
df.rank(method=method, ascending=False, numeric_only=True) tm.assert_frame_equal(res2, expected) - res3 = df.rank(method=rank_method, ascending=False, numeric_only=False) + res3 = df.rank(method=method, ascending=False, numeric_only=False) tm.assert_frame_equal(res3, expected) - @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) - @pytest.mark.parametrize("ascending", [True, False]) - def test_rank_first_ties(self, dtype, ascending, frame_or_series): - obj = frame_or_series([1, 1], dtype=dtype) - result = obj.rank(method="first", ascending=ascending) - expected_data = [1, 2] - if ascending: - expected_data = expected_data[::-1] - expected = frame_or_series(expected_data, dtype=np.float64) - tm.assert_equal(result, expected) - @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("dtype", [None, object]) def test_rank_2d_tie_methods(self, method, axis, dtype): From 7a04159b64d2a09e461f8b66fa1048375faf64b8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 12:57:57 -0400 Subject: [PATCH 12/25] Add object first test --- pandas/tests/frame/methods/test_rank.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 46988350c2367..6c5831ad897d1 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -448,6 +448,38 @@ def test_rank_both_inf(self): result = df.rank() tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("top", True, [3.0, 1.0, 2.0]), + ("top", False, [2.0, 1.0, 3.0]), + ("bottom", True, [2.0, 3.0, 1.0]), + ("bottom", False, [1.0, 3.0, 2.0]), + ], + ) + def test_rank_inf_nans_na_option( + self, frame_or_series, method, na_option, ascending, expected + ): + obj = frame_or_series([np.inf, np.nan, -np.inf]) + result = obj.rank(method=method, na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("bottom", True, [1.0, 2.0, 4.0, 3.0]), + ("bottom", False, [1.0, 2.0, 4.0, 3.0]), + ("top", True, [2.0, 3.0, 1.0, 4.0]), + ("top", False, [2.0, 3.0, 1.0, 4.0]), + ], + ) + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + obj = frame_or_series(["foo", "foo", None, "foo"]) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + @pytest.mark.parametrize( "data,expected", [ From ab9989e69462b5cd7d8c6e2a8c52c832c83d3c3c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 13:00:51 -0400 Subject: [PATCH 13/25] Add back nogil --- pandas/_libs/algos.pyx | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3a33206384fa8..b1d05ee7ca677 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1449,21 +1449,22 @@ def rank_2d( # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals = values - for col in range(k): - rank_sorted_1d( - out[:, col], - grp_sizes[:, col], - labels, - argsort_indexer[:, col], - masked_vals[:, col], - mask[:, col], - tiebreak, - check_mask, - False, - keep_na, - pct, - n, - ) + with nogil: + for col in range(k): + rank_sorted_1d( + out[:, col], + grp_sizes[:, col], + labels, + argsort_indexer[:, col], + masked_vals[:, col], 
+ mask[:, col], + tiebreak, + check_mask, + False, + keep_na, + pct, + n, + ) if axis == 1: return np.array(out.T) From 5ba6459ec6490548b7444c845dcea8d1fde6190f Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 13:33:02 -0400 Subject: [PATCH 14/25] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 ++ pandas/_libs/algos.pyx | 37 +++++++++++++++------------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ff1c6ebf7aae2..674d6287be9ea 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -911,6 +911,8 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` when the DataFrame contained ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising an ``IndexError`` (:issue:`38932`) +- Bug in :meth:`DataFrame.rank` raising ... with ``object`` columns and ``method="first"`` (:issue:`...`) +- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`...`) - Bug in :meth:`Series.rank`, :meth:`DataFrame.rank`, and :meth:`.GroupBy.rank` treating the most negative ``int64`` value as missing (:issue:`32859`) - Bug in :meth:`DataFrame.select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36596`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed the argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b1d05ee7ca677..77e068406cf45 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1378,7 +1378,7 @@ def rank_2d( ndarray[rank_t, ndim=2] values ndarray[rank_t, ndim=1] unused rank_t[:, :] masked_vals - intp_t[:, :] argsort_indexer + intp_t[:, :] sort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest @@ -1406,29 +1406,20 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') - if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is int64_t and is_datetimelike: - mask = (values == NPY_NAT).astype(np.uint8) - elif rank_t is float64_t: - mask = np.isnan(values).astype(np.uint8) - else: - mask = np.zeros_like(values, dtype=np.uint8) - nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: # For fused type specialization unused = values[:, 0] nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) - # if rank_t is object: - # mask = missing.isnaobj2d(values).view(np.uint8) - # elif rank_t is float64_t: - # mask = np.isnan(values).view(np.uint8) - # - # # int64 and datetimelike - # else: - # mask = (values == NPY_NAT).view(np.uint8) + if rank_t is object: + mask = missing.isnaobj2d(values).view(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).view(np.uint8) + + # int64 and datetimelike + else: + mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=np.uint8) @@ -1443,9 +1434,13 @@ def rank_2d( grp_sizes = np.ones((n, k), dtype='i8', order='F') labels = np.zeros(n, dtype=np.intp) - argsort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + if 
check_mask and not keep_na: + sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + else: + sort_indexer = values.argsort(axis=0).astype(np.intp, copy=False) + if not ascending: - argsort_indexer = argsort_indexer[::-1, :] + sort_indexer = sort_indexer[::-1, :] # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals = values @@ -1455,7 +1450,7 @@ def rank_2d( out[:, col], grp_sizes[:, col], labels, - argsort_indexer[:, col], + sort_indexer[:, col], masked_vals[:, col], mask[:, col], tiebreak, From 61540043a269e82841264e2a4524cc1b1b9c5579 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 14:32:49 -0400 Subject: [PATCH 15/25] Cleaner fused type handling --- pandas/_libs/algos.pyx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c55d1e9898b79..7e6521deac052 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -931,7 +931,7 @@ ctypedef fused rank_t: int64_t -cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, ndarray[rank_t, ndim=1] _): +cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None): """ Return the value we'll use to represent missing values when sorting depending on if we'd like missing values to end up at the top/bottom. (The second parameter @@ -1053,7 +1053,7 @@ def rank_1d( # will flip the ordering to still end up with lowest rank. # Symmetric logic applies to `na_option == 'bottom'` nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, masked_vals) + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) if nans_rank_highest: order = (masked_vals, mask, labels) else: @@ -1376,7 +1376,6 @@ def rank_2d( Py_ssize_t infs ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values - ndarray[rank_t, ndim=1] unused ndarray[intp_t, ndim=2] argsort_indexer ndarray[uint8_t, ndim=2] mask rank_t val, nan_fill_val @@ -1385,9 +1384,6 @@ def rank_2d( int64_t idx bint check_mask, condition, keep_na, nans_rank_highest - if in_arr.shape[0] == 0 or in_arr.shape[1] == 0: - return np.empty_like(in_arr, dtype="f8") - tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -1406,9 +1402,7 @@ def rank_2d( nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - # For fused type specialization - unused = values[:, 0] - nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) if rank_t is object: mask = missing.isnaobj2d(values) From 0f8744da80722984c99cd1d5d1ec9ebbcab74b73 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 15:03:40 -0400 Subject: [PATCH 16/25] Add comment --- pandas/_libs/algos.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 193d99108b74e..ed6ba52e4aede 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1428,12 +1428,17 @@ def rank_2d( grp_sizes = np.ones((n, k), dtype='i8', order='F') labels = np.zeros(n, dtype=np.intp) - sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + # lexsort is slower, so only use if we need to worry about the mask + if check_mask: + sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + else: + kind = "stable" if ties_method == "first" else None + sort_indexer = values.argsort(axis=0, kind=kind) if not ascending: sort_indexer = sort_indexer[::-1, :] - # 
putmask doesn't accept a memoryview, so we assign as a separate step + # putmask doesn't accept a memoryview, so we assign in a separate step masked_vals = values with nogil: for col in range(k): From d47f2a6abb80d95affa3b885df7609a51bc8f8ea Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 15:18:58 -0400 Subject: [PATCH 17/25] Update whatsnew --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 674d6287be9ea..54411724a6709 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -911,8 +911,8 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` when the DataFrame contained ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising an ``IndexError`` (:issue:`38932`) -- Bug in :meth:`DataFrame.rank` raising ... with ``object`` columns and ``method="first"`` (:issue:`...`) -- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`...`) +- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) +- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`) - Bug in :meth:`Series.rank`, :meth:`DataFrame.rank`, and :meth:`.GroupBy.rank` treating the most negative ``int64`` value as missing (:issue:`32859`) - Bug in :meth:`DataFrame.select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36596`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed the argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) From da61fb8dee12953f8714a4b164cd95a10aaac47e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 17:03:27 -0400 Subject: [PATCH 18/25] Try 32-bit fix --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ed6ba52e4aede..ca57d7d686925 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1433,7 +1433,7 @@ def rank_2d( sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) else: kind = "stable" if ties_method == "first" else None - sort_indexer = values.argsort(axis=0, kind=kind) + sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False) if not ascending: sort_indexer = sort_indexer[::-1, :] From e2d96179fcbe2cf02ab77fa19860eaebe40d1dab Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 19:09:27 -0400 Subject: [PATCH 19/25] Debug 32-bit --- pandas/_libs/algos.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ca57d7d686925..8a3b5ac77ced8 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1091,8 +1091,6 @@ def rank_1d( return np.array(out) -@cython.wraparound(False) -@cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, From b4d11a410065bd0f9b903637cab8d955770a9c48 Mon Sep 
17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 20:11:28 -0400 Subject: [PATCH 20/25] Debug 32-bit --- pandas/_libs/algos.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 8a3b5ac77ced8..ca57d7d686925 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1091,6 +1091,8 @@ def rank_1d( return np.array(out) +@cython.wraparound(False) +@cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, From 1e47daec7ed59dabb78006b3eef8251d8d47aa03 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 17 Jun 2021 10:46:43 -0700 Subject: [PATCH 21/25] Move whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 -- doc/source/whatsnew/v1.4.0.rst | 3 ++- pandas/_libs/algos.pyx | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7d71de9a4f261..6c2fef3808566 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -941,8 +941,6 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` when the DataFrame contained ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising an ``IndexError`` (:issue:`38932`) -- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) -- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`) - Bug in :meth:`Series.rank`, :meth:`DataFrame.rank`, and :meth:`.GroupBy.rank` treating the most negative ``int64`` value as missing (:issue:`32859`) - Bug in :meth:`DataFrame.select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36596`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed the argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 166ea2f0d4164..d748fcff14c61 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -137,7 +137,8 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) +- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`) - Conversion diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d776111f177bc..a026cbe447c19 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1261,6 +1261,7 @@ cdef void rank_sorted_1d( # when either of those change. 
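# A minimal pure-Python sketch of the dups/sum_ranks bookkeeping the comment
# above describes: walk the sorted values, count how many entries tie on the
# current value (dups), accumulate their positional ranks (sum_ranks), then
# backfill the averaged rank once the value changes. Names here are
# illustrative only and do not mirror the Cython implementation.
def _average_ranks(sorted_vals):
    out = [0.0] * len(sorted_vals)
    dups = sum_ranks = 0
    for i, val in enumerate(sorted_vals):
        dups += 1
        sum_ranks += i + 1
        at_end = i == len(sorted_vals) - 1
        if at_end or sorted_vals[i + 1] != val:
            # assign the average of the tied positions to every tied entry
            for j in range(i - dups + 1, i + 1):
                out[j] = sum_ranks / dups
            dups = sum_ranks = 0
    return out

_average_ranks([10, 20, 20, 30])  # [1.0, 2.5, 2.5, 4.0]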
Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or (masked_vals[sort_indexer[i]] != masked_vals[sort_indexer[i+1]]) @@ -1458,9 +1459,9 @@ def rank_2d( ) if axis == 1: - return np.array(out.T) + return np.asarray(out.T) else: - return np.array(out) + return np.asarray(out) ctypedef fused diff_t: From 8d038af23692e1bc9543b811ae0d0753153f2e00 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 17 Jun 2021 16:07:30 -0700 Subject: [PATCH 22/25] WIP --- pandas/_libs/algos.pyi | 2 +- pandas/_libs/algos.pyx | 243 ++++++++++++++++++++----------------- pandas/core/algorithms.py | 1 - pandas/tests/test_algos.py | 2 +- 4 files changed, 135 insertions(+), 113 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index d0f664c323a89..c398d8d45c5b8 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -123,7 +123,7 @@ def is_monotonic( def rank_1d( values: np.ndarray, # ndarray[rank_t, ndim=1] - labels: np.ndarray, # const int64_t[:] + labels: np.ndarray | None, # const int64_t[:]=None is_datetimelike: bool = ..., ties_method=..., ascending: bool = ..., diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a026cbe447c19..6f77ae214e0d2 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -389,11 +389,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr int64_t nobs = 0 bint no_nans float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor - const int64_t[:] labels_n, labels_nobs N, K = (mat).shape - # For compatibility when calling rank_1d - labels_n = np.zeros(N, dtype=np.int64) # Handle the edge case where we know all results will be nan # to keep conditional logic inside loop simpler @@ -412,7 +409,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr maskedx = np.empty(N, dtype=np.float64) maskedy = np.empty(N, dtype=np.float64) for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) + ranked_mat[:, i] = rank_1d(mat[:, i]) with nogil: for xi in range(K): @@ -451,11 +448,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr with gil: # We need to slice back to nobs because rank_1d will # require arrays of nobs length - labels_nobs = np.zeros(nobs, dtype=np.int64) - rankedx = rank_1d(np.array(maskedx)[:nobs], - labels=labels_nobs) - rankedy = rank_1d(np.array(maskedy)[:nobs], - labels=labels_nobs) + rankedx = rank_1d(np.array(maskedx)[:nobs]) + rankedy = rank_1d(np.array(maskedy)[:nobs]) for i in range(nobs): maskedx[i] = rankedx[i] maskedy[i] = rankedy[i] @@ -518,7 +512,6 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra int64_t total_discordant = 0 float64_t kendall_tau int64_t n_obs - const intp_t[:] labels_n N, K = (mat).shape @@ -526,11 +519,9 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra mask = np.isfinite(mat) ranked_mat = np.empty((N, K), dtype=np.float64) - # For compatibility when calling rank_1d - labels_n = np.zeros(N, dtype=np.intp) for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) + ranked_mat[:, i] = rank_1d(mat[:, i]) for xi in range(K): sorted_idxs = ranked_mat[:, xi].argsort() @@ -961,7 +952,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None): @cython.boundscheck(False) def rank_1d( ndarray[rank_t, ndim=1] values, - const intp_t[:] labels, + const intp_t[:] labels=None, bint is_datetimelike=False, ties_method="average", bint ascending=True, @@ 
-974,10 +965,10 @@ def rank_1d( Parameters ---------- values : array of rank_t values to be ranked - labels : np.ndarray[np.intp] + labels : np.ndarray[np.intp] or None Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. If not called - from a groupby operation, will be an array of 0's + from a groupby operation, will be None. is_datetimelike : bool, default False True if `values` contains datetime-like entries. ties_method : {'average', 'min', 'max', 'first', 'dense'}, default @@ -1000,12 +991,12 @@ def rank_1d( cdef: TiebreakEnumType tiebreak Py_ssize_t N - int64_t[::1] grp_sizes - intp_t[:] lexsort_indexer + intp_t[:] sort_indexer float64_t[::1] out ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview - uint8_t[:] mask + int64_t[::1] grp_sizes=None + uint8_t[:] mask=None bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val @@ -1017,17 +1008,18 @@ def rank_1d( keep_na = na_option == 'keep' N = len(values) - # TODO Cython 3.0: cast won't be necessary (#2992) - assert len(labels) == N + if labels is not None: + # TODO Cython 3.0: cast won't be necessary (#2992) + assert len(labels) == N out = np.empty(N) - grp_sizes = np.ones(N, dtype=np.int64) - # If all 0 labels, can short-circuit later label + # If we don't care about labels, can short-circuit later label # comparisons - check_labels = np.any(labels) + check_labels = labels is not None - # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + # If this doesn't hold, we don't care about group sizes, so don't even allocate + if pct and (tiebreak == TIEBREAK_DENSE or check_labels): + grp_sizes = np.ones(N, dtype=np.int64) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data @@ -1043,49 +1035,69 @@ def rank_1d( mask = (masked_vals == NPY_NAT).astype(np.uint8) elif rank_t is float64_t: mask = np.isnan(masked_vals).astype(np.uint8) + + # For cases where a mask is not possible, we can avoid mask checks + check_mask = mask is not None + + if check_mask: + # If `na_option == 'top'`, we want to assign the lowest rank + # to NaN regardless of ascending/descending. So if ascending, + # fill with lowest value of type to end up with lowest rank. + # If descending, fill with highest value since descending + # will flip the ordering to still end up with lowest rank. + # Symmetric logic applies to `na_option == 'bottom'` + nans_rank_highest = ascending ^ (na_option == 'top') + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + np.putmask(masked_vals, mask, nan_fill_val) + + # Depending on whether we care about labels and masks, we need + # different sorting criteria + + if check_mask and check_labels: + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + if nans_rank_highest: + order = (masked_vals, mask, labels) + else: + order = (masked_vals, ~(np.asarray(mask)), labels) + elif check_mask: + if nans_rank_highest: + order = (masked_vals, mask) + else: + order = (masked_vals, ~(np.asarray(mask))) + elif check_labels: + order = (masked_vals, labels) else: - mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) - - # If `na_option == 'top'`, we want to assign the lowest rank - # to NaN regardless of ascending/descending. 
So if ascending, - # fill with lowest value of type to end up with lowest rank. - # If descending, fill with highest value since descending - # will flip the ordering to still end up with lowest rank. - # Symmetric logic applies to `na_option == 'bottom'` - nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) - if nans_rank_highest: - order = (masked_vals, mask, labels) + order = None + + # lexsort is slower, so only use if we actually need to sort on multiple keys + if order is not None: + sort_indexer = np.lexsort(order).astype(np.intp, copy=False) else: - order = (masked_vals, ~(np.asarray(mask)), labels) + kind = "stable" if ties_method == "first" else None + sort_indexer = masked_vals.argsort(kind=kind).astype(np.intp, copy=False) + # print(np.array(sort_indexer)) - np.putmask(masked_vals, mask, nan_fill_val) # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals_memview = masked_vals - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False) - if not ascending: - lexsort_indexer = lexsort_indexer[::-1] + sort_indexer = sort_indexer[::-1] with nogil: rank_sorted_1d( out, - grp_sizes, - labels, - lexsort_indexer, masked_vals_memview, - mask, - tiebreak, - check_mask, - check_labels, - keep_na, - pct, + sort_indexer, N, + mask=mask, + grp_sizes=grp_sizes, + tiebreak=tiebreak, + keep_na=keep_na, + pct=pct, + labels=labels, ) return np.asarray(out) @@ -1095,18 +1107,18 @@ def rank_1d( @cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, - int64_t[::1] grp_sizes, - const intp_t[:] labels, - const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) rank_t[:] masked_vals, - const uint8_t[:] mask, - TiebreakEnumType tiebreak, - bint check_mask, - bint check_labels, - bint keep_na, - bint pct, + const intp_t[:] sort_indexer, Py_ssize_t N, + const uint8_t[:] mask=None, + int64_t[::1] grp_sizes=None, + TiebreakEnumType tiebreak=TIEBREAK_AVERAGE, + bint keep_na=True, + bint pct=False, + # https://github.com/cython/cython/issues/1630, only trailing arguments can + # currently be omitted for cdef functions, which is why we keep these at the end + const intp_t[:] labels=None, ) nogil: """ See rank_1d.__doc__. 
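# A rough pure-NumPy sketch of the sort-key selection shown above: lexsort is
# only needed when several keys matter (values plus mask and/or labels); with a
# single key a plain argsort, stable for method="first", suffices. The helper
# name and the "missing sorts last" choice are assumptions for illustration;
# the patch picks mask or ~mask depending on where missing values should rank.
import numpy as np

def _build_sort_indexer(masked_vals, mask=None, labels=None, ties_method="average"):
    keys = [np.asarray(masked_vals)]
    if mask is not None:
        keys.append(np.asarray(mask))    # push missing values after non-missing
    if labels is not None:
        keys.append(np.asarray(labels))  # np.lexsort sorts by the last key first
    if len(keys) > 1:
        return np.lexsort(keys).astype(np.intp, copy=False)
    kind = "stable" if ties_method == "first" else None
    return keys[0].argsort(kind=kind).astype(np.intp, copy=False)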
Handles only actual ranking, so sorting and masking should @@ -1116,36 +1128,40 @@ cdef void rank_sorted_1d( ---------- out : float64_t[::1] Array to store computed ranks - grp_sizes : int64_t[::1] - Array to store group counts, only used if pct=True - labels : See rank_1d.__doc__ - sort_indexer : intp_t[:] - Array of indices which sorts masked_vals masked_vals : rank_t[:] The values input to rank_1d, with missing values replaced by fill values - mask : uint8_t[:] - Array where entries are True if the value is missing, False otherwise - tiebreak : TiebreakEnumType - See rank_1d.__doc__ for the different modes - check_mask : bool - If False, assumes the mask is all False to skip mask indexing - check_labels : bool - If False, assumes all labels are the same to skip group handling logic - keep_na : bool - Whether or not to keep nulls - pct : bool - Compute percentage rank of data within each group + sort_indexer : intp_t[:] + Array of indices which sorts masked_vals N : Py_ssize_t The number of elements to rank. Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) + mask : uint8_t[:], default None + Array where entries are True if the value is missing, False otherwise. None + implies the mask is all False + grp_sizes : int64_t[::1], default None + Array to store group counts, only used if pct=True. Should only be None + if labels is None. + tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE + See rank_1d.__doc__ for the different modes + keep_na : bool, default True + Whether or not to keep nulls + pct : bool, default False + Compute percentage rank of data within each group + labels : See rank_1d.__doc__, default None. None implies all labels are the same. """ cdef: Py_ssize_t i, j, dups=0, sum_ranks=0, Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 - bint at_end, next_val_diff, group_changed + bint at_end, next_val_diff, group_changed, check_mask, check_labels + bint grp_size_needed int64_t grp_size + check_mask = mask is not None + check_labels = labels is not None + # Group size only needs to be tracked if we have groups or are doing dense ranking + grp_size_needed = pct and (check_labels or tiebreak == TIEBREAK_DENSE) + # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer # array that we sorted previously, which gives us the location of @@ -1245,8 +1261,9 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + if grp_size_needed: + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1345,8 +1362,9 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + if grp_size_needed: + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1354,9 +1372,15 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 if pct: + # If we're grouping, use the computed group sizes, otherwise we can just + # use the data length + for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] + if grp_size_needed: + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + else: + out[i] = out[i] / N def rank_2d( @@ -1374,12 +1398,12 @@ def rank_2d( cdef: Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous - int64_t[::1, :] 
grp_sizes - const intp_t[:] labels ndarray[rank_t, ndim=2] values rank_t[:, :] masked_vals intp_t[:, :] sort_indexer - uint8_t[:, :] mask + uint8_t[:, :] mask=None + uint8_t[:] mask_arg=None + int64_t[::1] grp_sizes=None TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val @@ -1416,21 +1440,21 @@ def rank_2d( else: mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) - else: - mask = np.zeros_like(values, dtype=np.uint8) - - if nans_rank_highest: - order = (values, mask) - else: - order = (values, ~np.asarray(mask)) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') - grp_sizes = np.ones((n, k), dtype='i8', order='F') - labels = np.zeros(n, dtype=np.intp) + + # If this doesn't hold, we don't care about group sizes, so don't even allocate + if pct and tiebreak == TIEBREAK_DENSE: + grp_sizes = np.ones(n, dtype=np.int64) # lexsort is slower, so only use if we need to worry about the mask if check_mask: + if nans_rank_highest: + order = (values, mask) + else: + order = (values, ~np.asarray(mask)) + sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) else: kind = "stable" if ties_method == "first" else None @@ -1443,19 +1467,18 @@ def rank_2d( masked_vals = values with nogil: for col in range(k): + if mask is not None: + mask_arg = mask[:, col] rank_sorted_1d( out[:, col], - grp_sizes[:, col], - labels, - sort_indexer[:, col], masked_vals[:, col], - mask[:, col], - tiebreak, - check_mask, - False, - keep_na, - pct, + sort_indexer[:, col], n, + mask=mask_arg, + grp_sizes=grp_sizes, + tiebreak=tiebreak, + keep_na=keep_na, + pct=pct, ) if axis == 1: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7dcc83f76db75..2d108b599bbaf 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1008,7 +1008,6 @@ def rank( if values.ndim == 1: ranks = algos.rank_1d( values, - labels=np.zeros(len(values), dtype=np.intp), is_datetimelike=is_datetimelike, ties_method=method, ascending=ascending, diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 4df95d895e475..b4836dffffa06 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1747,7 +1747,7 @@ def test_scipy_compat(self): def _check(arr): mask = ~np.isfinite(arr) arr = arr.copy() - result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.intp)) + result = libalgos.rank_1d(arr) arr[mask] = np.inf exp = rankdata(arr) exp[mask] = np.nan From f90b8d9726b6649dcb49be2a55da6541d377f913 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 17 Jun 2021 16:10:55 -0700 Subject: [PATCH 23/25] Clean up group sizes --- pandas/_libs/algos.pyx | 51 +++++++++++++----------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6f77ae214e0d2..9d55535b55e2e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -995,7 +995,7 @@ def rank_1d( float64_t[::1] out ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview - int64_t[::1] grp_sizes=None + int64_t[::1] grp_sizes uint8_t[:] mask=None bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val @@ -1012,15 +1012,12 @@ def rank_1d( # TODO Cython 3.0: cast won't be necessary (#2992) assert len(labels) == N out = np.empty(N) + grp_sizes = np.ones(N, dtype=np.int64) # If we don't care about labels, can short-circuit later label # comparisons check_labels = labels is not None - # If this doesn't hold, we don't care 
about group sizes, so don't even allocate - if pct and (tiebreak == TIEBREAK_DENSE or check_labels): - grp_sizes = np.ones(N, dtype=np.int64) - # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -1052,7 +1049,6 @@ def rank_1d( # Depending on whether we care about labels and masks, we need # different sorting criteria - if check_mask and check_labels: # lexsort using labels, then mask, then actual values # each label corresponds to a different group value, @@ -1078,7 +1074,6 @@ def rank_1d( else: kind = "stable" if ties_method == "first" else None sort_indexer = masked_vals.argsort(kind=kind).astype(np.intp, copy=False) - # print(np.array(sort_indexer)) # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals_memview = masked_vals @@ -1091,9 +1086,9 @@ def rank_1d( out, masked_vals_memview, sort_indexer, + grp_sizes, N, mask=mask, - grp_sizes=grp_sizes, tiebreak=tiebreak, keep_na=keep_na, pct=pct, @@ -1110,9 +1105,9 @@ cdef void rank_sorted_1d( # Can make const with cython3 (https://github.com/cython/cython/issues/3222) rank_t[:] masked_vals, const intp_t[:] sort_indexer, + int64_t[::1] grp_sizes, Py_ssize_t N, const uint8_t[:] mask=None, - int64_t[::1] grp_sizes=None, TiebreakEnumType tiebreak=TIEBREAK_AVERAGE, bint keep_na=True, bint pct=False, @@ -1132,15 +1127,15 @@ cdef void rank_sorted_1d( The values input to rank_1d, with missing values replaced by fill values sort_indexer : intp_t[:] Array of indices which sorts masked_vals + grp_sizes : int64_t[::1] + Array to store group counts, only used if pct=True. Should only be None + if labels is None. N : Py_ssize_t The number of elements to rank. Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) mask : uint8_t[:], default None Array where entries are True if the value is missing, False otherwise. None implies the mask is all False - grp_sizes : int64_t[::1], default None - Array to store group counts, only used if pct=True. Should only be None - if labels is None. 
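# A small illustrative sketch of the pct step that consumes grp_sizes: once
# each element records the size of its group, percentage ranks are an
# element-wise division, with zero-sized groups left untouched (mirroring the
# `grp_sizes[i] != 0` guard). The helper name is hypothetical.
import numpy as np

def _to_pct(ranks, grp_sizes):
    out = np.asarray(ranks, dtype=np.float64).copy()
    sizes = np.asarray(grp_sizes)
    nonzero = sizes != 0
    out[nonzero] = out[nonzero] / sizes[nonzero]
    return out

_to_pct([1.0, 2.5, 2.5, 4.0], [4, 4, 4, 4])  # array([0.25, 0.625, 0.625, 1.0])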
tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE See rank_1d.__doc__ for the different modes keep_na : bool, default True @@ -1154,13 +1149,10 @@ cdef void rank_sorted_1d( Py_ssize_t i, j, dups=0, sum_ranks=0, Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 bint at_end, next_val_diff, group_changed, check_mask, check_labels - bint grp_size_needed int64_t grp_size check_mask = mask is not None check_labels = labels is not None - # Group size only needs to be tracked if we have groups or are doing dense ranking - grp_size_needed = pct and (check_labels or tiebreak == TIEBREAK_DENSE) # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer @@ -1261,9 +1253,8 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - if grp_size_needed: - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1362,9 +1353,8 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - if grp_size_needed: - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1372,15 +1362,9 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 if pct: - # If we're grouping, use the computed group sizes, otherwise we can just - # use the data length - for i in range(N): - if grp_size_needed: - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] - else: - out[i] = out[i] / N + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] def rank_2d( @@ -1401,9 +1385,9 @@ def rank_2d( ndarray[rank_t, ndim=2] values rank_t[:, :] masked_vals intp_t[:, :] sort_indexer + int64_t[::1] grp_sizes uint8_t[:, :] mask=None uint8_t[:] mask_arg=None - int64_t[::1] grp_sizes=None TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val @@ -1443,10 +1427,7 @@ def rank_2d( n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') - - # If this doesn't hold, we don't care about group sizes, so don't even allocate - if pct and tiebreak == TIEBREAK_DENSE: - grp_sizes = np.ones(n, dtype=np.int64) + grp_sizes = np.ones(n, dtype=np.int64) # lexsort is slower, so only use if we need to worry about the mask if check_mask: @@ -1473,9 +1454,9 @@ def rank_2d( out[:, col], masked_vals[:, col], sort_indexer[:, col], + grp_sizes, n, mask=mask_arg, - grp_sizes=grp_sizes, tiebreak=tiebreak, keep_na=keep_na, pct=pct, From d9b234216cd436e2c1381768df35971e221bbf7b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 26 Jun 2021 15:20:24 -0400 Subject: [PATCH 24/25] Fixups --- pandas/_libs/algos.pyi | 2 +- pandas/_libs/algos.pyx | 136 ++++++++++++++++++----------------------- 2 files changed, 62 insertions(+), 76 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index c398d8d45c5b8..9da5534c51321 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -123,7 +123,7 @@ def is_monotonic( def rank_1d( values: np.ndarray, # ndarray[rank_t, ndim=1] - labels: np.ndarray | None, # const int64_t[:]=None + labels: np.ndarray | None = ..., # const int64_t[:]=None is_datetimelike: bool = ..., ties_method=..., ascending: bool = ..., diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a3aea22073803..1bfbe49e0a82f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -991,12 +991,12 @@ def rank_1d( 
cdef: TiebreakEnumType tiebreak Py_ssize_t N - intp_t[:] sort_indexer + int64_t[::1] grp_sizes + intp_t[:] lexsort_indexer float64_t[::1] out ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview - int64_t[::1] grp_sizes - uint8_t[:] mask=None + uint8_t[:] mask bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val @@ -1018,6 +1018,9 @@ def rank_1d( # comparisons check_labels = labels is not None + # For cases where a mask is not possible, we can avoid mask checks + check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -1032,63 +1035,47 @@ def rank_1d( mask = (masked_vals == NPY_NAT).astype(np.uint8) elif rank_t is float64_t: mask = np.isnan(masked_vals).astype(np.uint8) - - # For cases where a mask is not possible, we can avoid mask checks - check_mask = mask is not None - - if check_mask: - # If `na_option == 'top'`, we want to assign the lowest rank - # to NaN regardless of ascending/descending. So if ascending, - # fill with lowest value of type to end up with lowest rank. - # If descending, fill with highest value since descending - # will flip the ordering to still end up with lowest rank. - # Symmetric logic applies to `na_option == 'bottom'` - nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) - np.putmask(masked_vals, mask, nan_fill_val) - - # Depending on whether we care about labels and masks, we need - # different sorting criteria - if check_mask and check_labels: - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - if nans_rank_highest: - order = (masked_vals, mask, labels) - else: - order = (masked_vals, ~(np.asarray(mask)), labels) - elif check_mask: - if nans_rank_highest: - order = (masked_vals, mask) - else: - order = (masked_vals, ~(np.asarray(mask))) - elif check_labels: - order = (masked_vals, labels) else: - order = None - - # lexsort is slower, so only use if we actually need to sort on multiple keys - if order is not None: - sort_indexer = np.lexsort(order).astype(np.intp, copy=False) + mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + + # If `na_option == 'top'`, we want to assign the lowest rank + # to NaN regardless of ascending/descending. So if ascending, + # fill with lowest value of type to end up with lowest rank. + # If descending, fill with highest value since descending + # will flip the ordering to still end up with lowest rank. 
+ # Symmetric logic applies to `na_option == 'bottom'` + nans_rank_highest = ascending ^ (na_option == 'top') + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + if nans_rank_highest: + order = [masked_vals, mask] else: - kind = "stable" if ties_method == "first" else None - sort_indexer = masked_vals.argsort(kind=kind).astype(np.intp, copy=False) + order = [masked_vals, ~(np.asarray(mask))] + + if check_labels: + order.append(labels) + np.putmask(masked_vals, mask, nan_fill_val) # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals_memview = masked_vals + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False) + if not ascending: - sort_indexer = sort_indexer[::-1] + lexsort_indexer = lexsort_indexer[::-1] with nogil: rank_sorted_1d( out, - masked_vals_memview, - sort_indexer, grp_sizes, + lexsort_indexer, + masked_vals_memview, + mask, + check_mask, N, - mask=mask, tiebreak=tiebreak, keep_na=keep_na, pct=pct, @@ -1102,17 +1089,18 @@ def rank_1d( @cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, + int64_t[::1] grp_sizes, + const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) rank_t[:] masked_vals, - const intp_t[:] sort_indexer, - int64_t[::1] grp_sizes, + const uint8_t[:] mask, + bint check_mask, Py_ssize_t N, - const uint8_t[:] mask=None, TiebreakEnumType tiebreak=TIEBREAK_AVERAGE, bint keep_na=True, bint pct=False, # https://github.com/cython/cython/issues/1630, only trailing arguments can - # currently be omitted for cdef functions, which is why we keep these at the end + # currently be omitted for cdef functions, which is why we keep this at the end const intp_t[:] labels=None, ) nogil: """ @@ -1123,19 +1111,20 @@ cdef void rank_sorted_1d( ---------- out : float64_t[::1] Array to store computed ranks - masked_vals : rank_t[:] - The values input to rank_1d, with missing values replaced by fill values - sort_indexer : intp_t[:] - Array of indices which sorts masked_vals grp_sizes : int64_t[::1] Array to store group counts, only used if pct=True. Should only be None if labels is None. + sort_indexer : intp_t[:] + Array of indices which sorts masked_vals + masked_vals : rank_t[:] + The values input to rank_1d, with missing values replaced by fill values + mask : uint8_t[:] + Array where entries are True if the value is missing, False otherwise. + check_mask : bool + If False, assumes the mask is all False to skip mask indexing N : Py_ssize_t The number of elements to rank. Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) - mask : uint8_t[:], default None - Array where entries are True if the value is missing, False otherwise. 
None - implies the mask is all False tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE See rank_1d.__doc__ for the different modes keep_na : bool, default True @@ -1148,10 +1137,9 @@ cdef void rank_sorted_1d( cdef: Py_ssize_t i, j, dups=0, sum_ranks=0, Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 - bint at_end, next_val_diff, group_changed, check_mask, check_labels + bint at_end, next_val_diff, group_changed, check_labels int64_t grp_size - check_mask = mask is not None check_labels = labels is not None # Loop over the length of the value array @@ -1382,12 +1370,11 @@ def rank_2d( cdef: Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous + int64_t[::1] grp_sizes ndarray[rank_t, ndim=2] values rank_t[:, :] masked_vals intp_t[:, :] sort_indexer - int64_t[::1] grp_sizes - uint8_t[:, :] mask=None - uint8_t[:] mask_arg=None + uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val @@ -1424,18 +1411,18 @@ def rank_2d( else: mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) + else: + mask = np.zeros_like(values, dtype=np.uint8) + + if nans_rank_highest: + order = (values, mask) + else: + order = (values, ~np.asarray(mask)) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') grp_sizes = np.ones(n, dtype=np.int64) - # lexsort is slower, so only use if we need to worry about the mask - if check_mask: - if nans_rank_highest: - order = (values, mask) - else: - order = (values, ~np.asarray(mask)) - # lexsort is slower, so only use if we need to worry about the mask if check_mask: sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) @@ -1450,15 +1437,14 @@ def rank_2d( masked_vals = values with nogil: for col in range(k): - if mask is not None: - mask_arg = mask[:, col] rank_sorted_1d( out[:, col], - masked_vals[:, col], - sort_indexer[:, col], grp_sizes, + sort_indexer[:, col], + masked_vals[:, col], + mask[:, col], + check_mask, n, - mask=mask_arg, tiebreak=tiebreak, keep_na=keep_na, pct=pct, From e92d7c555dfaaf1cf3501b2643095309ae0d84dc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Jun 2021 22:47:43 -0400 Subject: [PATCH 25/25] array -> asarray and better arg calling --- pandas/_libs/algos.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 1bfbe49e0a82f..172f2bfb49160 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -448,8 +448,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr with gil: # We need to slice back to nobs because rank_1d will # require arrays of nobs length - rankedx = rank_1d(np.array(maskedx)[:nobs]) - rankedy = rank_1d(np.array(maskedy)[:nobs]) + rankedx = rank_1d(np.asarray(maskedx)[:nobs]) + rankedy = rank_1d(np.asarray(maskedy)[:nobs]) for i in range(nobs): maskedx[i] = rankedx[i] maskedy[i] = rankedy[i] @@ -1074,8 +1074,8 @@ def rank_1d( lexsort_indexer, masked_vals_memview, mask, - check_mask, - N, + check_mask=check_mask, + N=N, tiebreak=tiebreak, keep_na=keep_na, pct=pct, @@ -1443,8 +1443,8 @@ def rank_2d( sort_indexer[:, col], masked_vals[:, col], mask[:, col], - check_mask, - n, + check_mask=check_mask, + N=n, tiebreak=tiebreak, keep_na=keep_na, pct=pct,
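A short usage example of the behaviour targeted by the whatsnew entries above, with the missing value no longer tying against ``np.inf`` when ``na_option`` is ``"top"`` or ``"bottom"``. The outputs shown are the expected results after the fix, assuming the default ``method="average"``::

    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, np.inf, 1.0])

    # With na_option="bottom", the missing value should rank strictly after
    # every real value, including np.inf, rather than tying with it.
    s.rank(na_option="bottom")
    # expected ranks by position: 3.0 (NaN), 2.0 (inf), 1.0

    # Symmetrically, na_option="top" should rank the missing value first.
    s.rank(na_option="top")
    # expected ranks by position: 1.0 (NaN), 3.0 (inf), 2.0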