From 56e7de735d37bb5594585e0f14ab88cf5a5e1a98 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Tue, 15 Oct 2019 09:15:18 -0700
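Subject: [PATCH 1/3] REF: use fused types for rank_1d

Replace the Tempita-generated rank_1d_{{dtype}} variants with a single
rank_1d function over the fused type rank_t (object, float64_t,
uint64_t, int64_t). Template-time {{if dtype == ...}} branches become
"if rank_t is ..." checks inside the function body.

The per-dtype names rank_1d_object, rank_1d_float64, rank_1d_uint64 and
rank_1d_int64 are kept as aliases of the fused specializations.

The ranking loop now appears twice, once for object and once inside a
nogil block for the other types, because Cython does not support a
conditional nogil. The Tempita dtype loop is moved down so that it only
wraps rank_2d.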

---
 pandas/_libs/algos_rank_helper.pxi.in | 246 ++++++++++++++++----------
 1 file changed, 151 insertions(+), 95 deletions(-)

diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
index 1ba1667b687be..c7441aef90c13 100644
--- a/pandas/_libs/algos_rank_helper.pxi.in
+++ b/pandas/_libs/algos_rank_helper.pxi.in
@@ -8,24 +8,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # rank_1d, rank_2d
 # ----------------------------------------------------------------------
 
-{{py:
-
-# dtype ctype pos_nan_value neg_nan_value
-dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
-          ('float64', 'float64_t', 'np.inf', '-np.inf'),
-          ('uint64', 'uint64_t', '', ''),
-          ('int64', 'int64_t', 'np.iinfo(np.int64).max',
-           'np.iinfo(np.int64).min')]
-
-}}
-
-{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}}
+ctypedef fused rank_t:
+    object
+    float64_t
+    uint64_t
+    int64_t
 
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average',
-                      ascending=True, na_option='keep', pct=False):
+def rank_1d(rank_t[:] in_arr, ties_method='average',
+            ascending=True, na_option='keep', pct=False):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -33,85 +26,86 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average',
     cdef:
         Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
 
-        {{if dtype == 'object'}}
-        ndarray sorted_data, values
-        {{else}}
-        ndarray[{{ctype}}] sorted_data, values
-        {{endif}}
+        ndarray[rank_t] sorted_data, values
 
         ndarray[float64_t] ranks
         ndarray[int64_t] argsorted
         ndarray[uint8_t, cast=True] sorted_mask
 
-        {{if dtype == 'uint64'}}
-        {{ctype}} val
-        {{else}}
-        {{ctype}} val, nan_value
-        {{endif}}
+        rank_t val, nan_value
 
         float64_t sum_ranks = 0
         int tiebreak = 0
         bint keep_na = 0
-        bint isnan
+        bint isnan, condition
         float64_t count = 0.0
+
     tiebreak = tiebreakers[ties_method]
 
-    {{if dtype == 'float64'}}
-    values = np.asarray(in_arr).copy()
-    {{elif dtype == 'object'}}
-    values = np.array(in_arr, copy=True)
+    if rank_t is float64_t:
+        values = np.asarray(in_arr).copy()
+    elif rank_t is object:
+        values = np.array(in_arr, copy=True)
 
-    if values.dtype != np.object_:
-        values = values.astype('O')
-    {{else}}
-    values = np.asarray(in_arr)
-    {{endif}}
+        if values.dtype != np.object_:
+            values = values.astype('O')
+    else:
+        values = np.asarray(in_arr)
 
     keep_na = na_option == 'keep'
 
-    {{if dtype == 'object'}}
-    mask = missing.isnaobj(values)
-    {{elif dtype == 'float64'}}
-    mask = np.isnan(values)
-    {{elif dtype == 'int64'}}
-    mask = values == NPY_NAT
+    if rank_t is object:
+        mask = missing.isnaobj(values)
+    elif rank_t is float64_t:
+        mask = np.isnan(values)
+    elif rank_t is int64_t:
+        mask = values == NPY_NAT
 
-    # create copy in case of NPY_NAT
-    # values are mutated inplace
-    if mask.any():
-        values = values.copy()
-    {{endif}}
+        # create copy in case of NPY_NAT
+        # values are mutated inplace
+        if mask.any():
+            values = values.copy()
 
     # double sort first by mask and then by values to ensure nan values are
     # either at the beginning or the end. mask/(~mask) controls padding at
     # tail or the head
-    {{if dtype != 'uint64'}}
-    if ascending ^ (na_option == 'top'):
-        nan_value = {{pos_nan_value}}
-        order = (values, mask)
+    if rank_t is not uint64_t:
+        if ascending ^ (na_option == 'top'):
+            if rank_t is object:
+                nan_value = Infinity()
+            elif rank_t is float64_t:
+                nan_value = np.inf
+            elif rank_t is int64_t:
+                nan_value = np.iinfo(np.int64).max
+
+            order = (values, mask)
+        else:
+            if rank_t is object:
+                nan_value = NegInfinity()
+            elif rank_t is float64_t:
+                nan_value = -np.inf
+            elif rank_t is int64_t:
+                nan_value = np.iinfo(np.int64).min
+
+            order = (values, ~mask)
+        np.putmask(values, mask, nan_value)
     else:
-        nan_value = {{neg_nan_value}}
-        order = (values, ~mask)
-    np.putmask(values, mask, nan_value)
-    {{else}}
-    mask = np.zeros(shape=len(values), dtype=bool)
-    order = (values, mask)
-    {{endif}}
+        mask = np.zeros(shape=len(values), dtype=bool)
+        order = (values, mask)
 
     n = len(values)
     ranks = np.empty(n, dtype='f8')
 
-    {{if dtype == 'object'}}
-    _as = np.lexsort(keys=order)
-    {{else}}
-    if tiebreak == TIEBREAK_FIRST:
-        # need to use a stable sort here
+    if rank_t is object:
         _as = np.lexsort(keys=order)
-        if not ascending:
-            tiebreak = TIEBREAK_FIRST_DESCENDING
     else:
-        _as = np.lexsort(keys=order)
-    {{endif}}
+        if tiebreak == TIEBREAK_FIRST:
+            # need to use a stable sort here
+            _as = np.lexsort(keys=order)
+            if not ascending:
+                tiebreak = TIEBREAK_FIRST_DESCENDING
+        else:
+            _as = np.lexsort(keys=order)
 
     if not ascending:
         _as = _as[::-1]
@@ -122,38 +116,30 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average',
     non_na_idx = _indices[0] if len(_indices) > 0 else -1
     argsorted = _as.astype('i8')
 
-    {{if dtype == 'object'}}
-    if True:
-    {{else}}
-    with nogil:
-    {{endif}}
-        # TODO: why does the 2d version not have a nogil block?
+    if rank_t is object:
+        # TODO: merge with nogil loop below once Cython has conditional nogil
         for i in range(n):
             sum_ranks += i + 1
             dups += 1
 
-            {{if dtype == 'object'}}
-            val = util.get_value_at(sorted_data, i)
-            {{else}}
             val = sorted_data[i]
-            {{endif}}
 
-            {{if dtype != 'uint64'}}
-            isnan = sorted_mask[i]
-            if isnan and keep_na:
-                ranks[argsorted[i]] = NaN
-                continue
-            {{endif}}
+            if rank_t is not uint64_t:
+                isnan = sorted_mask[i]
+                if isnan and keep_na:
+                    ranks[argsorted[i]] = NaN
+                    continue
 
             count += 1.0
 
-            {{if dtype == 'object'}}
-            if (i == n - 1 or
-                    are_diff(util.get_value_at(sorted_data, i + 1), val) or
-                    i == non_na_idx):
-            {{else}}
-            if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
-            {{endif}}
+            if rank_t is object:
+                condition = (i == n - 1 or
+                    are_diff(sorted_data[i + 1], val) or
+                    i == non_na_idx)
+            else:
+                condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx)
+
+            if condition:
 
                 if tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
@@ -165,13 +151,12 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average',
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = i + 1
                 elif tiebreak == TIEBREAK_FIRST:
-                    {{if dtype == 'object'}}
-                    raise ValueError('first not supported for '
-                                     'non-numeric data')
-                    {{else}}
-                    for j in range(i - dups + 1, i + 1):
-                        ranks[argsorted[j]] = j + 1
-                    {{endif}}
+                    if rank_t is object:
+                        raise ValueError('first not supported for '
+                                         'non-numeric data')
+                    else:
+                        for j in range(i - dups + 1, i + 1):
+                            ranks[argsorted[j]] = j + 1
                 elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = 2 * i - j - dups + 2
@@ -180,6 +165,58 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average',
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = total_tie_count
                 sum_ranks = dups = 0
+
+    else:
+        with nogil:
+            # TODO: why does the 2d version not have a nogil block?
+            for i in range(n):
+                sum_ranks += i + 1
+                dups += 1
+
+                val = sorted_data[i]
+
+                if rank_t is not uint64_t:
+                    isnan = sorted_mask[i]
+                    if isnan and keep_na:
+                        ranks[argsorted[i]] = NaN
+                        continue
+
+                count += 1.0
+
+                if rank_t is object:
+                    condition = (i == n - 1 or
+                        are_diff(sorted_data[i + 1], val) or
+                        i == non_na_idx)
+                else:
+                    condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx)
+
+                if condition:
+
+                    if tiebreak == TIEBREAK_AVERAGE:
+                        for j in range(i - dups + 1, i + 1):
+                            ranks[argsorted[j]] = sum_ranks / dups
+                    elif tiebreak == TIEBREAK_MIN:
+                        for j in range(i - dups + 1, i + 1):
+                            ranks[argsorted[j]] = i - dups + 2
+                    elif tiebreak == TIEBREAK_MAX:
+                        for j in range(i - dups + 1, i + 1):
+                            ranks[argsorted[j]] = i + 1
+                    elif tiebreak == TIEBREAK_FIRST:
+                        if rank_t is object:
+                            raise ValueError('first not supported for '
+                                             'non-numeric data')
+                        else:
+                            for j in range(i - dups + 1, i + 1):
+                                ranks[argsorted[j]] = j + 1
+                    elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+                        for j in range(i - dups + 1, i + 1):
+                            ranks[argsorted[j]] = 2 * i - j - dups + 2
+                    elif tiebreak == TIEBREAK_DENSE:
+                        total_tie_count += 1
+                        for j in range(i - dups + 1, i + 1):
+                            ranks[argsorted[j]] = total_tie_count
+                    sum_ranks = dups = 0
+
     if pct:
         if tiebreak == TIEBREAK_DENSE:
             return ranks / total_tie_count
@@ -188,6 +225,25 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average',
     else:
         return ranks
 
+rank_1d_object = rank_1d["object"]
+rank_1d_float64 = rank_1d["float64_t"]
+rank_1d_uint64 = rank_1d["uint64_t"]
+rank_1d_int64 = rank_1d["int64_t"]
+
+
+{{py:
+
+# dtype ctype pos_nan_value neg_nan_value
+dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
+          ('float64', 'float64_t', 'np.inf', '-np.inf'),
+          ('uint64', 'uint64_t', '', ''),
+          ('int64', 'int64_t', 'np.iinfo(np.int64).max',
+           'np.iinfo(np.int64).min')]
+
+}}
+
+{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}}
+
 
 def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average',
                       ascending=True, na_option='keep', pct=False):

From 1433942ef345486fbcfedd4d0b852914b64b4a15 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Tue, 15 Oct 2019 10:11:15 -0700
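Subject: [PATCH 2/3] fused types for rank_2d

Convert rank_2d from a Tempita per-dtype template into a single function
over the fused type rank_t, and remove the remaining {{py: ...}} /
{{for}} / {{endfor}} scaffolding from the file. rank_2d_object,
rank_2d_float64, rank_2d_uint64 and rank_2d_int64 are kept as aliases of
the fused specializations.

Changes in spelling compared with the template version, for reviewers:
- "values" is declared as ndarray[rank_t, ndim=2] for every type; the
  cast=True previously used for int64/uint64 is gone.
- the lower int64 sentinel is written as NPY_NAT instead of
  np.iinfo(np.int64).min.
- the sorted values are gathered with _take_2d instead of
  _take_2d_{{dtype}}.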

---
 pandas/_libs/algos_rank_helper.pxi.in | 194 ++++++++++++--------------
 1 file changed, 89 insertions(+), 105 deletions(-)

diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
index c7441aef90c13..6adac96f12faf 100644
--- a/pandas/_libs/algos_rank_helper.pxi.in
+++ b/pandas/_libs/algos_rank_helper.pxi.in
@@ -225,28 +225,15 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
     else:
         return ranks
 
+
 rank_1d_object = rank_1d["object"]
 rank_1d_float64 = rank_1d["float64_t"]
 rank_1d_uint64 = rank_1d["uint64_t"]
 rank_1d_int64 = rank_1d["int64_t"]
 
 
-{{py:
-
-# dtype ctype pos_nan_value neg_nan_value
-dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
-          ('float64', 'float64_t', 'np.inf', '-np.inf'),
-          ('uint64', 'uint64_t', '', ''),
-          ('int64', 'int64_t', 'np.iinfo(np.int64).max',
-           'np.iinfo(np.int64).min')]
-
-}}
-
-{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}}
-
-
-def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average',
-                      ascending=True, na_option='keep', pct=False):
+def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average',
+            ascending=True, na_option='keep', pct=False):
     """
     Fast NaN-friendly version of scipy.stats.rankdata
     """
@@ -254,29 +241,20 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average',
     cdef:
         Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
 
-        {{if dtype == 'object'}}
         Py_ssize_t infs
-        {{endif}}
 
         ndarray[float64_t, ndim=2] ranks
-        {{if dtype == 'int64' or dtype == 'uint64'}}
-        ndarray[{{ctype}}, ndim=2, cast=True] values
-        {{else}}
-        ndarray[{{ctype}}, ndim=2] values
-        {{endif}}
+        ndarray[rank_t, ndim=2] values
 
         ndarray[int64_t, ndim=2] argsorted
 
-        {{if dtype == 'uint64'}}
-        {{ctype}} val
-        {{else}}
-        {{ctype}} val, nan_value
-        {{endif}}
+        rank_t val, nan_value
 
         float64_t sum_ranks = 0
         int tiebreak = 0
         bint keep_na = 0
         float64_t count = 0.0
+        bint condition, skip_condition
 
     tiebreak = tiebreakers[ties_method]
 
@@ -287,103 +265,106 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average',
     else:
         values = np.asarray(in_arr).copy()
 
-    {{if dtype == 'object'}}
-    if values.dtype != np.object_:
-        values = values.astype('O')
-    {{endif}}
+    if rank_t is object:
+        if values.dtype != np.object_:
+            values = values.astype('O')
 
-    {{if dtype != 'uint64'}}
-    if ascending ^ (na_option == 'top'):
-        nan_value = {{pos_nan_value}}
-    else:
-        nan_value = {{neg_nan_value}}
+    if rank_t is not uint64_t:
+        if ascending ^ (na_option == 'top'):
+            if rank_t is object:
+                nan_value = Infinity()
+            elif rank_t is float64_t:
+                nan_value = np.inf
+            elif rank_t is int64_t:
+                nan_value = np.iinfo(np.int64).max
+
+        else:
+            if rank_t is object:
+                nan_value = NegInfinity()
+            elif rank_t is float64_t:
+                nan_value = -np.inf
+            elif rank_t is int64_t:
+                nan_value = NPY_NAT
 
-    {{if dtype == 'object'}}
-    mask = missing.isnaobj2d(values)
-    {{elif dtype == 'float64'}}
-    mask = np.isnan(values)
-    {{elif dtype == 'int64'}}
-    mask = values == NPY_NAT
-    {{endif}}
+        if rank_t is object:
+            mask = missing.isnaobj2d(values)
+        elif rank_t is float64_t:
+            mask = np.isnan(values)
+        elif rank_t is int64_t:
+            mask = values == NPY_NAT
 
-    np.putmask(values, mask, nan_value)
-    {{endif}}
+        np.putmask(values, mask, nan_value)
 
     n, k = (<object>values).shape
     ranks = np.empty((n, k), dtype='f8')
 
-    {{if dtype == 'object'}}
-    try:
-        _as = values.argsort(1)
-    except TypeError:
-        values = in_arr
-        for i in range(len(values)):
-            ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method,
-                                      ascending=ascending, pct=pct)
-        if axis == 0:
-            return ranks.T
-        else:
-            return ranks
-    {{else}}
-    if tiebreak == TIEBREAK_FIRST:
-        # need to use a stable sort here
-        _as = values.argsort(axis=1, kind='mergesort')
-        if not ascending:
-            tiebreak = TIEBREAK_FIRST_DESCENDING
+    if rank_t is object:
+        try:
+            _as = values.argsort(1)
+        except TypeError:
+            values = in_arr
+            for i in range(len(values)):
+                ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method,
+                                          ascending=ascending, pct=pct)
+            if axis == 0:
+                return ranks.T
+            else:
+                return ranks
     else:
-        _as = values.argsort(1)
-    {{endif}}
+        if tiebreak == TIEBREAK_FIRST:
+            # need to use a stable sort here
+            _as = values.argsort(axis=1, kind='mergesort')
+            if not ascending:
+                tiebreak = TIEBREAK_FIRST_DESCENDING
+        else:
+            _as = values.argsort(1)
 
     if not ascending:
         _as = _as[:, ::-1]
 
-    values = _take_2d_{{dtype}}(values, _as)
+    values = _take_2d(values, _as)
     argsorted = _as.astype('i8')
 
     for i in range(n):
-        {{if dtype == 'object'}}
-        dups = sum_ranks = infs = 0
-        {{else}}
-        dups = sum_ranks = 0
-        {{endif}}
+        if rank_t is object:
+            dups = sum_ranks = infs = 0
+        else:
+            dups = sum_ranks = 0
 
         total_tie_count = 0
         count = 0.0
         for j in range(k):
-            {{if dtype != 'object'}}
-            sum_ranks += j + 1
-            dups += 1
-            {{endif}}
+            if rank_t is not object:
+                sum_ranks += j + 1
+                dups += 1
 
             val = values[i, j]
 
-            {{if dtype != 'uint64'}}
-            {{if dtype == 'object'}}
-            if (val is nan_value) and keep_na:
-            {{else}}
-            if (val == nan_value) and keep_na:
-            {{endif}}
-                ranks[i, argsorted[i, j]] = NaN
+            if rank_t is not uint64_t:
+                if rank_t is object:
+                    skip_condition = (val is nan_value) and keep_na
+                else:
+                    skip_condition = (val == nan_value) and keep_na
+                if skip_condition:
+                    ranks[i, argsorted[i, j]] = NaN
 
-                {{if dtype == 'object'}}
-                infs += 1
-                {{endif}}
+                    if rank_t is object:
+                        infs += 1
 
-                continue
-            {{endif}}
+                    continue
 
             count += 1.0
 
-            {{if dtype == 'object'}}
-            sum_ranks += (j - infs) + 1
-            dups += 1
-            {{endif}}
+            if rank_t is object:
+                sum_ranks += (j - infs) + 1
+                dups += 1
 
-            {{if dtype == 'object'}}
-            if j == k - 1 or are_diff(values[i, j + 1], val):
-            {{else}}
-            if j == k - 1 or values[i, j + 1] != val:
-            {{endif}}
+            if rank_t is object:
+                condition = j == k - 1 or are_diff(values[i, j + 1], val)
+            else:
+                condition = j == k - 1 or values[i, j + 1] != val
+
+            if condition:
                 if tiebreak == TIEBREAK_AVERAGE:
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = sum_ranks / dups
@@ -394,13 +375,12 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average',
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = j + 1
                 elif tiebreak == TIEBREAK_FIRST:
-                    {{if dtype == 'object'}}
-                    raise ValueError('first not supported '
-                                     'for non-numeric data')
-                    {{else}}
-                    for z in range(j - dups + 1, j + 1):
-                        ranks[i, argsorted[i, z]] = z + 1
-                    {{endif}}
+                    if rank_t is object:
+                        raise ValueError('first not supported '
+                                         'for non-numeric data')
+                    else:
+                        for z in range(j - dups + 1, j + 1):
+                            ranks[i, argsorted[i, z]] = z + 1
                 elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                     for z in range(j - dups + 1, j + 1):
                         ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
@@ -419,4 +399,8 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average',
     else:
         return ranks
 
-{{endfor}}
+
+rank_2d_object = rank_2d["object"]
+rank_2d_float64 = rank_2d["float64_t"]
+rank_2d_uint64 = rank_2d["uint64_t"]
+rank_2d_int64 = rank_2d["int64_t"]

From 894ebe6944e844b4a1933abe890c7c0102eb6c86 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Wed, 16 Oct 2019 16:43:49 -0700
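Subject: [PATCH 3/3] lint fixup

Wrap the two over-long "condition = (...)" assignments in rank_1d across
three lines each. No functional change.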

---
 pandas/_libs/algos_rank_helper.pxi.in | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
index 6adac96f12faf..d5a31b6a13010 100644
--- a/pandas/_libs/algos_rank_helper.pxi.in
+++ b/pandas/_libs/algos_rank_helper.pxi.in
@@ -137,7 +137,9 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
                     are_diff(sorted_data[i + 1], val) or
                     i == non_na_idx)
             else:
-                condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx)
+                condition = (i == n - 1 or
+                    sorted_data[i + 1] != val or
+                    i == non_na_idx)
 
             if condition:
 
@@ -188,7 +190,9 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
                         are_diff(sorted_data[i + 1], val) or
                         i == non_na_idx)
                 else:
-                    condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx)
+                    condition = (i == n - 1 or
+                        sorted_data[i + 1] != val or
+                        i == non_na_idx)
 
                 if condition: