From 56e7de735d37bb5594585e0f14ab88cf5a5e1a98 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 15 Oct 2019 09:15:18 -0700 Subject: [PATCH 1/3] REF: use fused types for rank_1d --- pandas/_libs/algos_rank_helper.pxi.in | 246 ++++++++++++++++---------- 1 file changed, 151 insertions(+), 95 deletions(-) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 1ba1667b687be..c7441aef90c13 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -8,24 +8,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # rank_1d, rank_2d # ---------------------------------------------------------------------- -{{py: - -# dtype ctype pos_nan_value neg_nan_value -dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), - ('float64', 'float64_t', 'np.inf', '-np.inf'), - ('uint64', 'uint64_t', '', ''), - ('int64', 'int64_t', 'np.iinfo(np.int64).max', - 'np.iinfo(np.int64).min')] - -}} - -{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t @cython.wraparound(False) @cython.boundscheck(False) -def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -33,85 +26,86 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', cdef: Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - {{if dtype == 'object'}} - ndarray sorted_data, values - {{else}} - ndarray[{{ctype}}] sorted_data, values - {{endif}} + ndarray[rank_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted ndarray[uint8_t, cast=True] sorted_mask - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 - bint isnan + bint isnan, condition float64_t count = 0.0 + tiebreak = tiebreakers[ties_method] - {{if dtype == 'float64'}} - values = np.asarray(in_arr).copy() - {{elif dtype == 'object'}} - values = np.array(in_arr, copy=True) + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) - if values.dtype != np.object_: - values = values.astype('O') - {{else}} - values = np.asarray(in_arr) - {{endif}} + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) keep_na = na_option == 'keep' - {{if dtype == 'object'}} - mask = missing.isnaobj(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - {{endif}} + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() # double sort first by mask and then by values to ensure nan values are # either at the beginning or the end. mask/(~mask) controls padding at # tail or the head - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - order = (values, mask) + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) else: - nan_value = {{neg_nan_value}} - order = (values, ~mask) - np.putmask(values, mask, nan_value) - {{else}} - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - {{endif}} + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) n = len(values) ranks = np.empty(n, dtype='f8') - {{if dtype == 'object'}} - _as = np.lexsort(keys=order) - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here + if rank_t is object: _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING else: - _as = np.lexsort(keys=order) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) if not ascending: _as = _as[::-1] @@ -122,38 +116,30 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', non_na_idx = _indices[0] if len(_indices) > 0 else -1 argsorted = _as.astype('i8') - {{if dtype == 'object'}} - if True: - {{else}} - with nogil: - {{endif}} - # TODO: why does the 2d version not have a nogil block? + if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil for i in range(n): sum_ranks += i + 1 dups += 1 - {{if dtype == 'object'}} - val = util.get_value_at(sorted_data, i) - {{else}} val = sorted_data[i] - {{endif}} - {{if dtype != 'uint64'}} - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - {{endif}} + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue count += 1.0 - {{if dtype == 'object'}} - if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val) or - i == non_na_idx): - {{else}} - if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx): - {{endif}} + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx) + + if condition: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -165,13 +151,12 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported for ' - 'non-numeric data') - {{else}} - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - {{endif}} + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 @@ -180,6 +165,58 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + if pct: if tiebreak == TIEBREAK_DENSE: return ranks / total_tie_count @@ -188,6 +225,25 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', else: return ranks +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +{{py: + +# dtype ctype pos_nan_value neg_nan_value +dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), + ('float64', 'float64_t', 'np.inf', '-np.inf'), + ('uint64', 'uint64_t', '', ''), + ('int64', 'int64_t', 'np.iinfo(np.int64).max', + 'np.iinfo(np.int64).min')] + +}} + +{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} + def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', ascending=True, na_option='keep', pct=False): From 1433942ef345486fbcfedd4d0b852914b64b4a15 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 15 Oct 2019 10:11:15 -0700 Subject: [PATCH 2/3] fused types for rank_2d --- pandas/_libs/algos_rank_helper.pxi.in | 194 ++++++++++++-------------- 1 file changed, 89 insertions(+), 105 deletions(-) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index c7441aef90c13..6adac96f12faf 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -225,28 +225,15 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', else: return ranks + rank_1d_object = rank_1d["object"] rank_1d_float64 = rank_1d["float64_t"] rank_1d_uint64 = rank_1d["uint64_t"] rank_1d_int64 = rank_1d["int64_t"] -{{py: - -# dtype ctype pos_nan_value neg_nan_value -dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), - ('float64', 'float64_t', 'np.inf', '-np.inf'), - ('uint64', 'uint64_t', '', ''), - ('int64', 'int64_t', 'np.iinfo(np.int64).max', - 'np.iinfo(np.int64).min')] - -}} - -{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} - - -def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -254,29 +241,20 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', cdef: Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - {{if dtype == 'object'}} Py_ssize_t infs - {{endif}} ndarray[float64_t, ndim=2] ranks - {{if dtype == 'int64' or dtype == 'uint64'}} - ndarray[{{ctype}}, ndim=2, cast=True] values - {{else}} - ndarray[{{ctype}}, ndim=2] values - {{endif}} + ndarray[rank_t, ndim=2] values ndarray[int64_t, ndim=2] argsorted - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 float64_t count = 0.0 + bint condition, skip_condition tiebreak = tiebreakers[ties_method] @@ -287,103 +265,106 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', else: values = np.asarray(in_arr).copy() - {{if dtype == 'object'}} - if values.dtype != np.object_: - values = values.astype('O') - {{endif}} + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - else: - nan_value = {{neg_nan_value}} + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT - {{if dtype == 'object'}} - mask = missing.isnaobj2d(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT - {{endif}} + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT - np.putmask(values, mask, nan_value) - {{endif}} + np.putmask(values, mask, nan_value) n, k = (values).shape ranks = np.empty((n, k), dtype='f8') - {{if dtype == 'object'}} - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks else: - _as = values.argsort(1) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) if not ascending: _as = _as[:, ::-1] - values = _take_2d_{{dtype}}(values, _as) + values = _take_2d(values, _as) argsorted = _as.astype('i8') for i in range(n): - {{if dtype == 'object'}} - dups = sum_ranks = infs = 0 - {{else}} - dups = sum_ranks = 0 - {{endif}} + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 total_tie_count = 0 count = 0.0 for j in range(k): - {{if dtype != 'object'}} - sum_ranks += j + 1 - dups += 1 - {{endif}} + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 val = values[i, j] - {{if dtype != 'uint64'}} - {{if dtype == 'object'}} - if (val is nan_value) and keep_na: - {{else}} - if (val == nan_value) and keep_na: - {{endif}} - ranks[i, argsorted[i, j]] = NaN + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN - {{if dtype == 'object'}} - infs += 1 - {{endif}} + if rank_t is object: + infs += 1 - continue - {{endif}} + continue count += 1.0 - {{if dtype == 'object'}} - sum_ranks += (j - infs) + 1 - dups += 1 - {{endif}} + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 - {{if dtype == 'object'}} - if j == k - 1 or are_diff(values[i, j + 1], val): - {{else}} - if j == k - 1 or values[i, j + 1] != val: - {{endif}} + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups @@ -394,13 +375,12 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported ' - 'for non-numeric data') - {{else}} - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - {{endif}} + if rank_t is object: + raise ValueError('first not supported ' + 'for non-numeric data') + else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 @@ -419,4 +399,8 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', else: return ranks -{{endfor}} + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] From 894ebe6944e844b4a1933abe890c7c0102eb6c86 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 16:43:49 -0700 Subject: [PATCH 3/3] lint fixup --- pandas/_libs/algos_rank_helper.pxi.in | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 6adac96f12faf..d5a31b6a13010 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -137,7 +137,9 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', are_diff(sorted_data[i + 1], val) or i == non_na_idx) else: - condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx) + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) if condition: @@ -188,7 +190,9 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', are_diff(sorted_data[i + 1], val) or i == non_na_idx) else: - condition = (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx) + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) if condition: