pandas/_libs/algos_rank_helper.pxi.in

"""
Template for each `dtype` helper function for rank

WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

#----------------------------------------------------------------------
# rank_1d, rank_2d
#----------------------------------------------------------------------

{{py:

# One row per generated specialization:
#   (dtype, ctype, pos_nan_value, neg_nan_value)
# The last two entries are source snippets substituted for the sentinel that
# replaces missing values; uint64 leaves them empty because that
# specialization never assigns a sentinel.
dtypes = [
    ('object', 'object', 'Infinity()', 'NegInfinity()'),
    ('float64', 'float64_t', 'np.inf', '-np.inf'),
    ('uint64', 'uint64_t', '', ''),
    ('int64', 'int64_t',
     'np.iinfo(np.int64).max', 'np.iinfo(np.int64).min'),
]

}}

{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}}


@cython.wraparound(False)
@cython.boundscheck(False)
def rank_1d_{{dtype}}(object in_arr, ties_method='average',
                      ascending=True, na_option='keep', pct=False):
    """
    Fast NaN-friendly version of scipy.stats.rankdata

    Parameters
    ----------
    in_arr : array-like
        One-dimensional data to rank; converted to an ndarray before use.
    ties_method : default 'average'
        Key looked up in the module-level ``tiebreakers`` mapping; selects
        how a run of equal values is ranked.
    ascending : bool, default True
        If False, the sort order is reversed before ranks are assigned.
    na_option : default 'keep'
        'keep' gives missing entries a rank of nan, 'top' gives them the
        lowest ranks, any other value gives them the highest ranks.
        The uint64 specialization treats no entry as missing.
    pct : bool, default False
        If True, divide the ranks by the number of ranked entries (or by
        the number of distinct groups for the dense tie-breaker).

    Returns
    -------
    ndarray of float64, same length as the input

    Raises
    ------
    ValueError
        In the object specialization, when the tie-breaker is
        TIEBREAK_FIRST and at least one group of values is ranked.
    """

    cdef:
        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0

        {{if dtype == 'object'}}
        ndarray sorted_data, values
        {{else}}
        ndarray[{{ctype}}] sorted_data, values
        {{endif}}

        ndarray[float64_t] ranks
        ndarray[int64_t] argsorted
        ndarray[uint8_t, cast=True] sorted_mask

        {{if dtype == 'uint64'}}
        {{ctype}} val
        {{else}}
        {{ctype}} val, nan_value
        {{endif}}

        float64_t sum_ranks = 0
        int tiebreak = 0
        bint keep_na = 0
        bint isnan
        # Must be double precision: a C ``float`` cannot represent every
        # integer above 2**24, so ``count += 1.0`` would stop growing there
        # and ``pct`` results for long inputs would be wrong.
        float64_t count = 0.0

    tiebreak = tiebreakers[ties_method]

    {{if dtype == 'float64'}}
    values = np.asarray(in_arr).copy()
    {{elif dtype == 'object'}}
    values = np.array(in_arr, copy=True)

    if values.dtype != np.object_:
        values = values.astype('O')
    {{else}}
    values = np.asarray(in_arr)
    {{endif}}

    keep_na = na_option == 'keep'

    {{if dtype == 'object'}}
    mask = missing.isnaobj(values)
    {{elif dtype == 'float64'}}
    mask = np.isnan(values)
    {{elif dtype == 'int64'}}
    mask = values == iNaT

    # create copy in case of iNaT
    # values are mutated inplace
    if mask.any():
        values = values.copy()
    {{endif}}

    # double sort first by mask and then by values to ensure nan values are
    # either at the beginning or the end. mask/(~mask) controls padding at
    # tail or the head
    {{if dtype != 'uint64'}}
    if ascending ^ (na_option == 'top'):
        nan_value = {{pos_nan_value}}
        order = (values, mask)
    else:
        nan_value = {{neg_nan_value}}
        order = (values, ~mask)
    np.putmask(values, mask, nan_value)
    {{else}}
    mask = np.zeros(shape=len(values), dtype=bool)
    order = (values, mask)
    {{endif}}

    n = len(values)
    ranks = np.empty(n, dtype='f8')

    {{if dtype == 'object'}}

    try:
        _as = np.lexsort(keys=order)
    except TypeError:
        # lexsort on object array will raise TypeError for numpy version
        # earlier than 1.11.0. Use argsort with order argument instead.
        _dt = [('values', 'O'), ('mask', '?')]
        _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt)
        _as = np.argsort(_values, kind='mergesort', order=('mask', 'values'))
    {{else}}
    # lexsort is a stable sort, which TIEBREAK_FIRST relies on, so one call
    # serves every tie-breaker
    _as = np.lexsort(keys=order)
    if tiebreak == TIEBREAK_FIRST and not ascending:
        tiebreak = TIEBREAK_FIRST_DESCENDING
    {{endif}}

    if not ascending:
        _as = _as[::-1]

    sorted_data = values.take(_as)
    sorted_mask = mask.take(_as)
    # position of the first missing/non-missing boundary in sorted order
    # (-1 if there is none); a tie group is always closed there so that a
    # real value equal to the sentinel is not merged with missing entries
    _indices = np.diff(sorted_mask).nonzero()[0]
    non_na_idx = _indices[0] if len(_indices) > 0 else -1
    argsorted = _as.astype('i8')

    {{if dtype == 'object'}}
    for i in range(n):
        # accumulate the current run of equal values
        sum_ranks += i + 1
        dups += 1
        isnan = sorted_mask[i]
        val = util.get_value_at(sorted_data, i)

        if isnan and keep_na:
            ranks[argsorted[i]] = nan
            continue
        count += 1.0

        # close the run at the last element, when the next value differs,
        # or at the missing/non-missing boundary
        if (i == n - 1 or
                are_diff(util.get_value_at(sorted_data, i + 1), val) or
                i == non_na_idx):
            if tiebreak == TIEBREAK_AVERAGE:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = sum_ranks / dups
            elif tiebreak == TIEBREAK_MIN:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = i - dups + 2
            elif tiebreak == TIEBREAK_MAX:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = i + 1
            elif tiebreak == TIEBREAK_FIRST:
                raise ValueError('first not supported for non-numeric data')
            elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = 2 * i - j - dups + 2
            elif tiebreak == TIEBREAK_DENSE:
                total_tie_count += 1
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = total_tie_count
            sum_ranks = dups = 0
    {{else}}
    with nogil:
        for i in range(n):
            # accumulate the current run of equal values
            sum_ranks += i + 1
            dups += 1
            val = sorted_data[i]

            {{if dtype != 'uint64'}}
            isnan = sorted_mask[i]
            if isnan and keep_na:
                ranks[argsorted[i]] = nan
                continue
            {{endif}}

            count += 1.0

            # close the run at the last element, when the next value
            # differs, or at the missing/non-missing boundary
            if (i == n - 1 or sorted_data[i + 1] != val or
                i == non_na_idx):
                if tiebreak == TIEBREAK_AVERAGE:
                    for j in range(i - dups + 1, i + 1):
                        ranks[argsorted[j]] = sum_ranks / dups
                elif tiebreak == TIEBREAK_MIN:
                    for j in range(i - dups + 1, i + 1):
                        ranks[argsorted[j]] = i - dups + 2
                elif tiebreak == TIEBREAK_MAX:
                    for j in range(i - dups + 1, i + 1):
                        ranks[argsorted[j]] = i + 1
                elif tiebreak == TIEBREAK_FIRST:
                    for j in range(i - dups + 1, i + 1):
                        ranks[argsorted[j]] = j + 1
                elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                    for j in range(i - dups + 1, i + 1):
                        ranks[argsorted[j]] = 2 * i - j - dups + 2
                elif tiebreak == TIEBREAK_DENSE:
                    total_tie_count += 1
                    for j in range(i - dups + 1, i + 1):
                        ranks[argsorted[j]] = total_tie_count
                sum_ranks = dups = 0
    {{endif}}
    if pct:
        if tiebreak == TIEBREAK_DENSE:
            return ranks / total_tie_count
        else:
            return ranks / count
    else:
        return ranks


def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
                      ascending=True, na_option='keep', pct=False):
    """
    Fast NaN-friendly version of scipy.stats.rankdata

    Parameters
    ----------
    in_arr : array-like
        Two-dimensional data to rank; converted to an ndarray before use.
    axis : int, default 0
        With 0 the input is transposed before ranking and the result is
        transposed back; any other value ranks along the second axis.
    ties_method : default 'average'
        Key looked up in the module-level ``tiebreakers`` mapping; selects
        how a run of equal values is ranked.
    ascending : bool, default True
        If False, the sort order is reversed before ranks are assigned.
    na_option : default 'keep'
        'keep' gives missing entries a rank of nan; 'top' versus any other
        value selects which sentinel replaces missing entries before
        sorting. The uint64 specialization treats no entry as missing.
    pct : bool, default False
        If True, divide each ranked row by its number of ranked entries
        (or by its number of distinct groups for the dense tie-breaker).

    Returns
    -------
    ndarray of float64 with the same shape as the input

    Raises
    ------
    ValueError
        In the object specialization, when the tie-breaker is
        TIEBREAK_FIRST and at least one group of values is ranked.
    """

    cdef:
        Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0

        {{if dtype == 'object'}}
        Py_ssize_t infs
        {{endif}}

        ndarray[float64_t, ndim=2] ranks
        {{if dtype == 'int64' or dtype == 'uint64'}}
        ndarray[{{ctype}}, ndim=2, cast=True] values
        {{else}}
        ndarray[{{ctype}}, ndim=2] values
        {{endif}}

        ndarray[int64_t, ndim=2] argsorted

        {{if dtype == 'uint64'}}
        {{ctype}} val
        {{else}}
        {{ctype}} val, nan_value
        {{endif}}

        float64_t sum_ranks = 0
        int tiebreak = 0
        bint keep_na = 0
        # Must be double precision: a C ``float`` cannot represent every
        # integer above 2**24, so ``count += 1.0`` would stop growing there
        # and ``pct`` results for long rows would be wrong.
        float64_t count = 0.0

    tiebreak = tiebreakers[ties_method]

    keep_na = na_option == 'keep'

    in_arr = np.asarray(in_arr)

    # always rank along the second axis of ``values``
    if axis == 0:
        values = in_arr.T.copy()
    else:
        values = in_arr.copy()

    {{if dtype == 'object'}}
    if values.dtype != np.object_:
        values = values.astype('O')
    {{endif}}

    {{if dtype != 'uint64'}}
    if ascending ^ (na_option == 'top'):
        nan_value = {{pos_nan_value}}
    else:
        nan_value = {{neg_nan_value}}

    {{if dtype == 'object'}}
    mask = missing.isnaobj2d(values)
    {{elif dtype == 'float64'}}
    mask = np.isnan(values)
    {{elif dtype == 'int64'}}
    mask = values == iNaT
    {{endif}}

    # replace missing entries with the sentinel so they sort to one end
    np.putmask(values, mask, nan_value)
    {{endif}}

    n, k = (<object> values).shape
    ranks = np.empty((n, k), dtype='f8')

    {{if dtype == 'object'}}
    try:
        _as = values.argsort(1)
    except TypeError:
        # Rank row by row from the unmodified input (``values`` already
        # holds sentinel objects). The rows must be taken in the same
        # orientation as ``values``/``ranks``, and ``na_option`` must be
        # forwarded so this path honours the caller's request.
        rows = in_arr.T if axis == 0 else in_arr
        for i in range(n):
            ranks[i] = rank_1d_object(rows[i], ties_method=ties_method,
                                      ascending=ascending,
                                      na_option=na_option, pct=pct)
        if axis == 0:
            return ranks.T
        else:
            return ranks
    {{else}}
    if tiebreak == TIEBREAK_FIRST:
        # need to use a stable sort here
        _as = values.argsort(axis=1, kind='mergesort')
        if not ascending:
            tiebreak = TIEBREAK_FIRST_DESCENDING
    else:
        _as = values.argsort(1)
    {{endif}}

    if not ascending:
        _as = _as[:, ::-1]

    values = _take_2d_{{dtype}}(values, _as)
    argsorted = _as.astype('i8')

    for i in range(n):
        # per-row state
        {{if dtype == 'object'}}
        dups = sum_ranks = infs = 0
        {{else}}
        dups = sum_ranks = 0
        {{endif}}

        total_tie_count = 0
        count = 0.0
        for j in range(k):
            {{if dtype != 'object'}}
            sum_ranks += j + 1
            dups += 1
            {{endif}}

            val = values[i, j]

            {{if dtype != 'uint64'}}
            # NOTE(review): the numeric specializations compare by value, so
            # a real entry equal to the sentinel is indistinguishable from a
            # missing one here and also gets a nan rank under 'keep'.
            {{if dtype == 'object'}}
            if (val is nan_value) and keep_na:
            {{else}}
            if (val == nan_value) and keep_na:
            {{endif}}
                ranks[i, argsorted[i, j]] = nan

                {{if dtype == 'object'}}
                infs += 1
                {{endif}}

                continue
            {{endif}}

            count += 1.0

            {{if dtype == 'object'}}
            sum_ranks += (j - infs) + 1
            dups += 1
            {{endif}}

            # close the run at the end of the row or when the next value
            # differs
            {{if dtype == 'object'}}
            if j == k - 1 or are_diff(values[i, j + 1], val):
            {{else}}
            if j == k - 1 or values[i, j + 1] != val:
            {{endif}}
                if tiebreak == TIEBREAK_AVERAGE:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsorted[i, z]] = sum_ranks / dups
                elif tiebreak == TIEBREAK_MIN:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsorted[i, z]] = j - dups + 2
                elif tiebreak == TIEBREAK_MAX:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsorted[i, z]] = j + 1
                elif tiebreak == TIEBREAK_FIRST:
                    {{if dtype == 'object'}}
                    raise ValueError('first not supported '
                                     'for non-numeric data')
                    {{else}}
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsorted[i, z]] = z + 1
                    {{endif}}
                elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
                elif tiebreak == TIEBREAK_DENSE:
                    total_tie_count += 1
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsorted[i, z]] = total_tie_count
                sum_ranks = dups = 0
        if pct:
            if tiebreak == TIEBREAK_DENSE:
                ranks[i, :] /= total_tie_count
            else:
                ranks[i, :] /= count
    if axis == 0:
        return ranks.T
    else:
        return ranks

{{endfor}}