From 96956058b7b2330a9872508f57713c5f07a1b8f6 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Dec 2020 00:18:31 -0500 Subject: [PATCH 01/15] WIP --- asv_bench/benchmarks/series_methods.py | 13 + pandas/_libs/algos.pyx | 544 +++++++++++++++++-------- pandas/_libs/groupby.pyx | 158 +------ 3 files changed, 389 insertions(+), 326 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 2db46abca119c..b501fbd687cf2 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -348,5 +348,18 @@ def setup(self, func, N, dtype): def time_func(self, func, N, dtype): self.func() +class Rank: + + param_names = ["dtype"] + params = [ + ["int", "uint", "float", "object"], + ] + + def setup(self, dtype): + self.s = Series(np.random.randint(0, 1000, size=100000), dtype=dtype) + + def time_frame_quantile(self, dtype): + self.s.rank() + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4cddd49381a83..a0cd052c36818 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -789,220 +789,402 @@ ctypedef fused rank_t: int64_t -@cython.wraparound(False) +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def rank_1d( +# ndarray[rank_t, ndim=1] in_arr, +# ties_method="average", +# bint ascending=True, +# na_option="keep", +# bint pct=False, +# ): +# """ +# Fast NaN-friendly version of ``scipy.stats.rankdata``. 
+# """ +# cdef: +# Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 +# ndarray[rank_t] sorted_data, values +# ndarray[float64_t] ranks +# ndarray[int64_t] argsorted +# ndarray[uint8_t, cast=True] sorted_mask +# rank_t val, nan_value +# float64_t sum_ranks = 0 +# int tiebreak = 0 +# bint keep_na = False +# bint isnan, condition +# float64_t count = 0.0 +# +# tiebreak = tiebreakers[ties_method] +# +# if rank_t is float64_t: +# values = np.asarray(in_arr).copy() +# elif rank_t is object: +# values = np.array(in_arr, copy=True) +# +# if values.dtype != np.object_: +# values = values.astype('O') +# else: +# values = np.asarray(in_arr).copy() +# +# keep_na = na_option == 'keep' +# +# if rank_t is object: +# mask = missing.isnaobj(values) +# elif rank_t is float64_t: +# mask = np.isnan(values) +# elif rank_t is int64_t: +# mask = values == NPY_NAT +# +# # double sort first by mask and then by values to ensure nan values are +# # either at the beginning or the end. mask/(~mask) controls padding at +# # tail or the head +# if rank_t is not uint64_t: +# if ascending ^ (na_option == 'top'): +# if rank_t is object: +# nan_value = Infinity() +# elif rank_t is float64_t: +# nan_value = np.inf +# elif rank_t is int64_t: +# nan_value = np.iinfo(np.int64).max +# +# order = (values, mask) +# else: +# if rank_t is object: +# nan_value = NegInfinity() +# elif rank_t is float64_t: +# nan_value = -np.inf +# elif rank_t is int64_t: +# nan_value = np.iinfo(np.int64).min +# +# order = (values, ~mask) +# np.putmask(values, mask, nan_value) +# else: +# mask = np.zeros(shape=len(values), dtype=bool) +# order = (values, mask) +# +# n = len(values) +# ranks = np.empty(n, dtype='f8') +# +# if rank_t is object: +# _as = np.lexsort(keys=order) +# else: +# if tiebreak == TIEBREAK_FIRST: +# # need to use a stable sort here +# _as = np.lexsort(keys=order) +# if not ascending: +# tiebreak = TIEBREAK_FIRST_DESCENDING +# else: +# _as = np.lexsort(keys=order) +# +# if not ascending: +# 
_as = _as[::-1] +# +# sorted_data = values.take(_as) +# sorted_mask = mask.take(_as) +# _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] +# non_na_idx = _indices[0] if len(_indices) > 0 else -1 +# argsorted = _as.astype('i8') +# +# if rank_t is object: +# # TODO: de-duplicate once cython supports conditional nogil +# for i in range(n): +# sum_ranks += i + 1 +# dups += 1 +# +# val = sorted_data[i] +# +# if rank_t is not uint64_t: +# isnan = sorted_mask[i] +# if isnan and keep_na: +# ranks[argsorted[i]] = NaN +# continue +# +# count += 1.0 +# +# if rank_t is object: +# condition = ( +# i == n - 1 or +# are_diff(sorted_data[i + 1], val) or +# i == non_na_idx +# ) +# else: +# condition = ( +# i == n - 1 or +# sorted_data[i + 1] != val or +# i == non_na_idx +# ) +# +# if condition: +# +# if tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = i - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = i + 1 +# elif tiebreak == TIEBREAK_FIRST: +# if rank_t is object: +# raise ValueError('first not supported for non-numeric data') +# else: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = j + 1 +# elif tiebreak == TIEBREAK_FIRST_DESCENDING: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = 2 * i - j - dups + 2 +# elif tiebreak == TIEBREAK_DENSE: +# total_tie_count += 1 +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = total_tie_count +# sum_ranks = dups = 0 +# +# else: +# with nogil: +# # TODO: why does the 2d version not have a nogil block? 
+# for i in range(n): +# sum_ranks += i + 1 +# dups += 1 +# +# val = sorted_data[i] +# +# if rank_t is not uint64_t: +# isnan = sorted_mask[i] +# if isnan and keep_na: +# ranks[argsorted[i]] = NaN +# continue +# +# count += 1.0 +# +# if rank_t is object: +# condition = ( +# i == n - 1 or +# are_diff(sorted_data[i + 1], val) or +# i == non_na_idx +# ) +# else: +# condition = ( +# i == n - 1 or +# sorted_data[i + 1] != val or +# i == non_na_idx +# ) +# +# if condition: +# +# if tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = i - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = i + 1 +# elif tiebreak == TIEBREAK_FIRST: +# if rank_t is object: +# raise ValueError('first not supported for non-numeric data') +# else: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = j + 1 +# elif tiebreak == TIEBREAK_FIRST_DESCENDING: +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = 2 * i - j - dups + 2 +# elif tiebreak == TIEBREAK_DENSE: +# total_tie_count += 1 +# for j in range(i - dups + 1, i + 1): +# ranks[argsorted[j]] = total_tie_count +# sum_ranks = dups = 0 +# +# if pct: +# if tiebreak == TIEBREAK_DENSE: +# return ranks / total_tie_count +# else: +# return ranks / count +# else: +# return ranks @cython.boundscheck(False) +@cython.wraparound(False) def rank_1d( ndarray[rank_t, ndim=1] in_arr, ties_method="average", bint ascending=True, na_option="keep", bint pct=False, + labels=None, ): - """ - Fast NaN-friendly version of ``scipy.stats.rankdata``. 
- """ cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - ndarray[rank_t] sorted_data, values - ndarray[float64_t] ranks - ndarray[int64_t] argsorted - ndarray[uint8_t, cast=True] sorted_mask - rank_t val, nan_value - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = False - bint isnan, condition - float64_t count = 0.0 + TiebreakEnumType tiebreak + Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 + ndarray[int64_t] _as + ndarray[float64_t] grp_sizes + ndarray[rank_t] masked_vals + ndarray[uint8_t] mask + bint keep_na + rank_t nan_fill_val + ndarray[float64_t] out + int64_t[:] labels_ tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' - if rank_t is float64_t: - values = np.asarray(in_arr).copy() - elif rank_t is object: - values = np.array(in_arr, copy=True) - - if values.dtype != np.object_: - values = values.astype('O') + N = in_arr.shape[0] + if labels is None: + labels_ = np.zeros(N, dtype="int") else: - values = np.asarray(in_arr).copy() + labels_ = labels - keep_na = na_option == 'keep' + out = np.empty(N) + grp_sizes = np.ones_like(in_arr, dtype="float") + + # Copy values into new array in order to fill missing data + # with mask, without obfuscating location of missing data + # in values array + masked_vals = np.array(in_arr, copy=True) + if rank_t is object and masked_vals.dtype != np.object_: + masked_vals = masked_vals.astype('O') if rank_t is object: - mask = missing.isnaobj(values) - elif rank_t is float64_t: - mask = np.isnan(values) + mask = missing.isnaobj(masked_vals) elif rank_t is int64_t: - mask = values == NPY_NAT - - # double sort first by mask and then by values to ensure nan values are - # either at the beginning or the end. 
mask/(~mask) controls padding at - # tail or the head - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - order = (values, mask) - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).min - - order = (values, ~mask) - np.putmask(values, mask, nan_value) + mask = (masked_vals == NPY_NAT).astype(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(masked_vals).astype(np.uint8) else: - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - - n = len(values) - ranks = np.empty(n, dtype='f8') + mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) - if rank_t is object: - _as = np.lexsort(keys=order) + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_fill_val = Infinity() + elif rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = np.iinfo(np.uint64).max + else: + nan_fill_val = np.inf + order = (masked_vals, mask, labels_) else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + if rank_t is object: + nan_fill_val = NegInfinity() + elif rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).min + elif rank_t is uint64_t: + nan_fill_val = 0 else: - _as = np.lexsort(keys=order) + nan_fill_val = -np.inf - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - sorted_mask = mask.take(_as) - _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] - non_na_idx = _indices[0] if len(_indices) > 0 else -1 - argsorted = _as.astype('i8') + order = (masked_vals, ~mask, labels_) - if rank_t is object: - # TODO: de-duplicate once cython supports conditional nogil - for i in 
range(n): - sum_ranks += i + 1 - dups += 1 + np.putmask(masked_vals, mask, nan_fill_val) - val = sorted_data[i] + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + _as = np.lexsort(order).astype(np.int64, copy=False) - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue + if not ascending: + _as = _as[::-1] - count += 1.0 + # Loop over the length of the value array + # each incremental i value can be looked up in the _as array + # that we sorted previously, which gives us the location of + # that sorted value for retrieval back from the original + # values / masked_vals arrays + for i in range(N): + at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change + # Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + # Update out only when there is a transition of values or labels. 
+ # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if not at_end: if rank_t is object: - condition = ( - i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx - ) + next_val_diff = are_diff(masked_vals[_as[i]], masked_vals[_as[i+1]]) else: - condition = ( - i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx - ) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for non-numeric data') + next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] + else: + next_val_diff = 1 + + if (next_val_diff + or mask[_as[i]] ^ mask[_as[i+1]] + or labels_[_as[i]] != labels_[_as[i+1]] + ): + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and mask[_as[i]]: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j]] = j + 1 - grp_start else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - 
ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - else: - with nogil: - # TODO: why does the 2d version not have a nogil block? - for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = ( - i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx - ) + out[_as[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = grp_vals_seen + + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. the conditional also needs to handle nan equality + # and the end of iteration + if (next_val_diff or mask[_as[i]] ^ mask[_as[i+1]]): + dups = sum_ranks = 0 + grp_vals_seen += 1 + grp_tie_count += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. fill in the size of each + # group encountered (used by pct calculations later). 
also be + # sure to reset any of the items helping to calculate dups + if at_end or labels_[_as[i]] != labels_[_as[i+1]]: + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j]] = (i - grp_start + 1 - + grp_na_count) else: - condition = ( - i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx - ) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 + for j in range(grp_start, i + 1): + grp_sizes[_as[j]] = (grp_tie_count - + (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 if pct: - if tiebreak == TIEBREAK_DENSE: - return ranks / total_tie_count - else: - return ranks / count - else: - return ranks + for i in range(N): + # We don't include NaN values in percentage + # rankings, so we assign them percentages of NaN. 
+ if out[i] != out[i] or out[i] == NaN: + out[i] = NaN + elif grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + + return out def rank_2d( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 5c4ba3b2729e3..65d2ba57b1c82 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -40,6 +40,7 @@ from pandas._libs.util cimport get_nat, numeric from pandas._libs.algos import ( groupsort_indexer, take_2d_axis1_float64_float64, + rank_1d, tiebreakers, ) @@ -1116,151 +1117,18 @@ def group_rank(float64_t[:, :] out, This method modifies the `out` parameter rather than returning an object """ cdef: - TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t, ndim=2] grp_sizes - ndarray[rank_t] masked_vals - ndarray[uint8_t] mask - bint keep_na - rank_t nan_fill_val - - if rank_t is object: - raise NotImplementedError("Cant do nogil") - - tiebreak = tiebreakers[ties_method] - keep_na = na_option == 'keep' - N, K = (values).shape - grp_sizes = np.ones_like(out) - - # Copy values into new array in order to fill missing data - # with mask, without obfuscating location of missing data - # in values array - masked_vals = np.array(values[:, 0], copy=True) - if rank_t is int64_t: - mask = (masked_vals == NPY_NAT).astype(np.uint8) - else: - mask = np.isnan(masked_vals).astype(np.uint8) - - if ascending ^ (na_option == 'top'): - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max - else: - nan_fill_val = np.inf - order = (masked_vals, mask, labels) - else: - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).min - elif rank_t is uint64_t: - nan_fill_val = 0 - else: - nan_fill_val = -np.inf - - order = (masked_vals, ~mask, labels) - np.putmask(masked_vals, mask, nan_fill_val) - - # lexsort using labels, then mask, then actual values - # 
each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - _as = np.lexsort(order).astype(np.int64, copy=False) - - if not ascending: - _as = _as[::-1] - - with nogil: - # Loop over the length of the value array - # each incremental i value can be looked up in the _as array - # that we sorted previously, which gives us the location of - # that sorted value for retrieval back from the original - # values / masked_vals arrays - for i in range(N): - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - # Update out only when there is a transition of values or labels. - # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]]) or - (labels[_as[i]] != labels[_as[i+1]])): - # if keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[_as[i]]: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - grp_start - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in 
range(i - dups + 1, i + 1): - out[_as[j], 0] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - grp_tie_count += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be - # sure to reset any of the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (i - grp_start + 1 - - grp_na_count) - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - - if pct: - for i in range(N): - # We don't include NaN values in percentage - # rankings, so we assign them percentages of NaN. 
- if out[i, 0] != out[i, 0] or out[i, 0] == NAN: - out[i, 0] = NAN - elif grp_sizes[i, 0] != 0: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] - + ndarray[float64_t] result + + result = rank_1d( + in_arr=values[:, 0], + labels=labels, + ties_method=ties_method, + ascending=ascending, + pct=pct, + na_option=na_option + ) + for i in range(result.shape[0]): + out[i, 0] = result[i] # ---------------------------------------------------------------------- # group_min, group_max From cc7b73f720fb7b03a064e3b7cf5ef75fecc11bb5 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Dec 2020 00:36:33 -0500 Subject: [PATCH 02/15] wip --- pandas/_libs/algos.pyx | 15 +++++++-------- pandas/_libs/groupby.pyx | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a0cd052c36818..42bf6820ae545 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1017,13 +1017,12 @@ def rank_1d( TiebreakEnumType tiebreak Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t] grp_sizes - ndarray[rank_t] masked_vals - ndarray[uint8_t] mask - bint keep_na + ndarray[int64_t, ndim=1] _as + ndarray[float64_t, ndim=1] grp_sizes, out + ndarray[rank_t, ndim=1] masked_vals + ndarray[uint8_t, ndim=1] mask + bint keep_na, at_end, next_val_diff rank_t nan_fill_val - ndarray[float64_t] out int64_t[:] labels_ tiebreak = tiebreakers[ties_method] @@ -1036,7 +1035,7 @@ def rank_1d( labels_ = labels out = np.empty(N) - grp_sizes = np.ones_like(in_arr, dtype="float") + grp_sizes = np.ones_like(out) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data @@ -1149,7 +1148,7 @@ def rank_1d( # reset the dups and sum_ranks, knowing that a new value is # coming up. 
the conditional also needs to handle nan equality # and the end of iteration - if (next_val_diff or mask[_as[i]] ^ mask[_as[i+1]]): + if next_val_diff or mask[_as[i]] ^ mask[_as[i+1]]: dups = sum_ranks = 0 grp_vals_seen += 1 grp_tie_count += 1 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 65d2ba57b1c82..0885c8cc346ec 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1117,7 +1117,7 @@ def group_rank(float64_t[:, :] out, This method modifies the `out` parameter rather than returning an object """ cdef: - ndarray[float64_t] result + ndarray[float64_t, ndim=1] result result = rank_1d( in_arr=values[:, 0], From 54e2397aeae5ec06cb55023716aff5d906e9e8a6 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Dec 2020 00:57:20 -0500 Subject: [PATCH 03/15] wip --- asv_bench/benchmarks/groupby.py | 33 -------------------------- asv_bench/benchmarks/series_methods.py | 2 +- pandas/_libs/algos.pyx | 8 ++++--- 3 files changed, 6 insertions(+), 37 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6ce63ff8badca..92301ec8ab1d1 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -384,40 +384,7 @@ class GroupByMethods: params = [ ["int", "float", "object", "datetime"], [ - "all", - "any", - "bfill", - "count", - "cumcount", - "cummax", - "cummin", - "cumprod", - "cumsum", - "describe", - "ffill", - "first", - "head", - "last", - "mad", - "max", - "min", - "median", - "mean", - "nunique", - "pct_change", - "prod", - "quantile", "rank", - "sem", - "shift", - "size", - "skew", - "std", - "sum", - "tail", - "unique", - "value_counts", - "var", ], ["direct", "transformation"], ] diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index b501fbd687cf2..5cf2029edb2cd 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -356,7 +356,7 @@ class Rank: ] def setup(self, 
dtype): - self.s = Series(np.random.randint(0, 1000, size=100000), dtype=dtype) + self.s = Series(np.random.randint(0, 1000, size=10000), dtype=dtype) def time_frame_quantile(self, dtype): self.s.rank() diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 42bf6820ae545..35d2c84ee11ae 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1021,7 +1021,7 @@ def rank_1d( ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff + bint keep_na, at_end, next_val_diff, check_labels rank_t nan_fill_val int64_t[:] labels_ @@ -1030,8 +1030,10 @@ def rank_1d( N = in_arr.shape[0] if labels is None: + check_labels = 0 labels_ = np.zeros(N, dtype="int") else: + check_labels = 1 labels_ = labels out = np.empty(N) @@ -1116,7 +1118,7 @@ def rank_1d( if (next_val_diff or mask[_as[i]] ^ mask[_as[i+1]] - or labels_[_as[i]] != labels_[_as[i+1]] + or (check_labels and labels_[_as[i]] != labels_[_as[i+1]]) ): # if keep_na, check for missing values and assign back # to the result where appropriate @@ -1159,7 +1161,7 @@ def rank_1d( # decrement that from their position. fill in the size of each # group encountered (used by pct calculations later). 
also be # sure to reset any of the items helping to calculate dups - if at_end or labels_[_as[i]] != labels_[_as[i+1]]: + if at_end or (check_labels and labels_[_as[i]] != labels_[_as[i+1]]): if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[_as[j]] = (i - grp_start + 1 - From f0b5edb3545f53d6d9284b3db7b23fb82b686ab2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Dec 2020 12:11:19 -0500 Subject: [PATCH 04/15] wip --- asv_bench/benchmarks/series_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 5cf2029edb2cd..1e7b40b8617b7 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -356,9 +356,9 @@ class Rank: ] def setup(self, dtype): - self.s = Series(np.random.randint(0, 1000, size=10000), dtype=dtype) + self.s = Series(np.random.randint(0, 1000, size=100000), dtype=dtype) - def time_frame_quantile(self, dtype): + def time_rank(self, dtype): self.s.rank() From a0abb1a4b87a3bf1bd1b89d7277d70a5605029b9 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Dec 2020 21:49:08 -0500 Subject: [PATCH 05/15] WIP --- asv_bench/benchmarks/frame_methods.py | 17 ++ asv_bench/benchmarks/groupby.py | 1 - asv_bench/benchmarks/series_methods.py | 1 + pandas/_libs/algos.pyx | 253 +++---------------------- pandas/_libs/groupby.pyx | 13 +- pandas/tests/groupby/test_rank.py | 1 + 6 files changed, 52 insertions(+), 234 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 70d90ded84545..98c884f25d30a 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -597,6 +597,23 @@ def time_frame_quantile(self, axis): self.df.quantile([0.1, 0.5], axis=axis) +class Rank: + param_names = ["dtype"] + params = [ + ["int", "uint", "float", "object"], + ] + + def setup(self, dtype): + self.df = 
DataFrame( + np.random.randn(10000, 10), + columns=range(10), + dtype=dtype + ) + + def time_rank(self, dtype): + self.df.rank() + + class GetDtypeCounts: # 2807 def setup(self): diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 92301ec8ab1d1..3c560e83c0189 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -29,7 +29,6 @@ "skew", "cumprod", "cummax", - "rank", "pct_change", "min", "var", diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 1e7b40b8617b7..b52c8142334be 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -348,6 +348,7 @@ def setup(self, func, N, dtype): def time_func(self, func, N, dtype): self.func() + class Rank: param_names = ["dtype"] diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 35d2c84ee11ae..0d993edfb27a3 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -789,222 +789,8 @@ ctypedef fused rank_t: int64_t -# @cython.wraparound(False) -# @cython.boundscheck(False) -# def rank_1d( -# ndarray[rank_t, ndim=1] in_arr, -# ties_method="average", -# bint ascending=True, -# na_option="keep", -# bint pct=False, -# ): -# """ -# Fast NaN-friendly version of ``scipy.stats.rankdata``. 
-# """ -# cdef: -# Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 -# ndarray[rank_t] sorted_data, values -# ndarray[float64_t] ranks -# ndarray[int64_t] argsorted -# ndarray[uint8_t, cast=True] sorted_mask -# rank_t val, nan_value -# float64_t sum_ranks = 0 -# int tiebreak = 0 -# bint keep_na = False -# bint isnan, condition -# float64_t count = 0.0 -# -# tiebreak = tiebreakers[ties_method] -# -# if rank_t is float64_t: -# values = np.asarray(in_arr).copy() -# elif rank_t is object: -# values = np.array(in_arr, copy=True) -# -# if values.dtype != np.object_: -# values = values.astype('O') -# else: -# values = np.asarray(in_arr).copy() -# -# keep_na = na_option == 'keep' -# -# if rank_t is object: -# mask = missing.isnaobj(values) -# elif rank_t is float64_t: -# mask = np.isnan(values) -# elif rank_t is int64_t: -# mask = values == NPY_NAT -# -# # double sort first by mask and then by values to ensure nan values are -# # either at the beginning or the end. mask/(~mask) controls padding at -# # tail or the head -# if rank_t is not uint64_t: -# if ascending ^ (na_option == 'top'): -# if rank_t is object: -# nan_value = Infinity() -# elif rank_t is float64_t: -# nan_value = np.inf -# elif rank_t is int64_t: -# nan_value = np.iinfo(np.int64).max -# -# order = (values, mask) -# else: -# if rank_t is object: -# nan_value = NegInfinity() -# elif rank_t is float64_t: -# nan_value = -np.inf -# elif rank_t is int64_t: -# nan_value = np.iinfo(np.int64).min -# -# order = (values, ~mask) -# np.putmask(values, mask, nan_value) -# else: -# mask = np.zeros(shape=len(values), dtype=bool) -# order = (values, mask) -# -# n = len(values) -# ranks = np.empty(n, dtype='f8') -# -# if rank_t is object: -# _as = np.lexsort(keys=order) -# else: -# if tiebreak == TIEBREAK_FIRST: -# # need to use a stable sort here -# _as = np.lexsort(keys=order) -# if not ascending: -# tiebreak = TIEBREAK_FIRST_DESCENDING -# else: -# _as = np.lexsort(keys=order) -# -# if not ascending: -# 
_as = _as[::-1] -# -# sorted_data = values.take(_as) -# sorted_mask = mask.take(_as) -# _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] -# non_na_idx = _indices[0] if len(_indices) > 0 else -1 -# argsorted = _as.astype('i8') -# -# if rank_t is object: -# # TODO: de-duplicate once cython supports conditional nogil -# for i in range(n): -# sum_ranks += i + 1 -# dups += 1 -# -# val = sorted_data[i] -# -# if rank_t is not uint64_t: -# isnan = sorted_mask[i] -# if isnan and keep_na: -# ranks[argsorted[i]] = NaN -# continue -# -# count += 1.0 -# -# if rank_t is object: -# condition = ( -# i == n - 1 or -# are_diff(sorted_data[i + 1], val) or -# i == non_na_idx -# ) -# else: -# condition = ( -# i == n - 1 or -# sorted_data[i + 1] != val or -# i == non_na_idx -# ) -# -# if condition: -# -# if tiebreak == TIEBREAK_AVERAGE: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = sum_ranks / dups -# elif tiebreak == TIEBREAK_MIN: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = i - dups + 2 -# elif tiebreak == TIEBREAK_MAX: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = i + 1 -# elif tiebreak == TIEBREAK_FIRST: -# if rank_t is object: -# raise ValueError('first not supported for non-numeric data') -# else: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = j + 1 -# elif tiebreak == TIEBREAK_FIRST_DESCENDING: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = 2 * i - j - dups + 2 -# elif tiebreak == TIEBREAK_DENSE: -# total_tie_count += 1 -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = total_tie_count -# sum_ranks = dups = 0 -# -# else: -# with nogil: -# # TODO: why does the 2d version not have a nogil block? 
-# for i in range(n): -# sum_ranks += i + 1 -# dups += 1 -# -# val = sorted_data[i] -# -# if rank_t is not uint64_t: -# isnan = sorted_mask[i] -# if isnan and keep_na: -# ranks[argsorted[i]] = NaN -# continue -# -# count += 1.0 -# -# if rank_t is object: -# condition = ( -# i == n - 1 or -# are_diff(sorted_data[i + 1], val) or -# i == non_na_idx -# ) -# else: -# condition = ( -# i == n - 1 or -# sorted_data[i + 1] != val or -# i == non_na_idx -# ) -# -# if condition: -# -# if tiebreak == TIEBREAK_AVERAGE: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = sum_ranks / dups -# elif tiebreak == TIEBREAK_MIN: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = i - dups + 2 -# elif tiebreak == TIEBREAK_MAX: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = i + 1 -# elif tiebreak == TIEBREAK_FIRST: -# if rank_t is object: -# raise ValueError('first not supported for non-numeric data') -# else: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = j + 1 -# elif tiebreak == TIEBREAK_FIRST_DESCENDING: -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = 2 * i - j - dups + 2 -# elif tiebreak == TIEBREAK_DENSE: -# total_tie_count += 1 -# for j in range(i - dups + 1, i + 1): -# ranks[argsorted[j]] = total_tie_count -# sum_ranks = dups = 0 -# -# if pct: -# if tiebreak == TIEBREAK_DENSE: -# return ranks / total_tie_count -# else: -# return ranks / count -# else: -# return ranks -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def rank_1d( ndarray[rank_t, ndim=1] in_arr, ties_method="average", @@ -1013,9 +799,34 @@ def rank_1d( bint pct=False, labels=None, ): + """ + Fast NaN-friendly version of ``scipy.stats.rankdata``. 
+ + Parameters + ---------- + in_arr : array of rank_t values to be ranked + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default + 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + ascending : boolean, default True + False for ranks by high (1) to low (N) + na_option : {'keep', 'top', 'bottom'}, default 'keep' + na_option : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + pct : boolean, default False + Compute percentage rank of data within each group + labels : optional array containing group labels (used only when called + from group_rank()) + """ cdef: TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 ndarray[int64_t, ndim=1] _as ndarray[float64_t, ndim=1] grp_sizes, out @@ -1028,16 +839,15 @@ def rank_1d( tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' - N = in_arr.shape[0] + N = len(in_arr) + check_labels = labels is not None if labels is None: - check_labels = 0 labels_ = np.zeros(N, dtype="int") else: - check_labels = 1 labels_ = labels out = np.empty(N) - grp_sizes = np.ones_like(out) + grp_sizes = np.ones(N) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data @@ -1088,7 +898,6 @@ def rank_1d( if not ascending: _as = _as[::-1] - # Loop over the length of the value array # each incremental i value can be looked up in the _as array # that we sorted previously, which gives us the location of @@ -1114,7 +923,7 @@ def rank_1d( else: next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] else: - next_val_diff = 1 + 
next_val_diff = True if (next_val_diff or mask[_as[i]] ^ mask[_as[i+1]] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0885c8cc346ec..135632c1d4c0e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -26,22 +26,13 @@ from numpy.math cimport NAN cnp.import_array() -from pandas._libs.algos cimport ( - TIEBREAK_AVERAGE, - TIEBREAK_DENSE, - TIEBREAK_FIRST, - TIEBREAK_MAX, - TIEBREAK_MIN, - TiebreakEnumType, - swap, -) +from pandas._libs.algos cimport swap from pandas._libs.util cimport get_nat, numeric from pandas._libs.algos import ( groupsort_indexer, take_2d_axis1_float64_float64, rank_1d, - tiebreakers, ) from pandas._libs.missing cimport checknull @@ -1127,7 +1118,7 @@ def group_rank(float64_t[:, :] out, pct=pct, na_option=na_option ) - for i in range(result.shape[0]): + for i in range(len(result)): out[i, 0] = result[i] # ---------------------------------------------------------------------- diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index ef6b4ae4836f8..f2046c5768668 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -444,6 +444,7 @@ def test_rank_avg_even_vals(): tm.assert_frame_equal(result, exp_df) +@pytest.mark.xfail(reason="Works now, needs tests") @pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) From 7db40c3dd5d280797b4ebc8f7c6bbcf5a818b1e7 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Dec 2020 22:45:10 -0500 Subject: [PATCH 06/15] REF/POC: share groupby/series algos (rank) --- asv_bench/benchmarks/groupby.py | 33 +++++++++++++++++++++++++++++++++ pandas/_libs/algos.pyx | 16 ++++++++-------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 3c560e83c0189..bf210352bcb5d 100644 --- 
a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -383,7 +383,40 @@ class GroupByMethods: params = [ ["int", "float", "object", "datetime"], [ + "all", + "any", + "bfill", + "count", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "describe", + "ffill", + "first", + "head", + "last", + "mad", + "max", + "min", + "median", + "mean", + "nunique", + "pct_change", + "prod", + "quantile", "rank", + "sem", + "shift", + "size", + "skew", + "std", + "sum", + "tail", + "unique", + "value_counts", + "var", ], ["direct", "transformation"], ] diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0d993edfb27a3..d5a6b69cdd643 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -801,7 +801,7 @@ def rank_1d( ): """ Fast NaN-friendly version of ``scipy.stats.rankdata``. - + Parameters ---------- in_arr : array of rank_t values to be ranked @@ -840,15 +840,15 @@ def rank_1d( keep_na = na_option == 'keep' N = len(in_arr) + out = np.empty(N) + grp_sizes = np.ones(N) + check_labels = labels is not None if labels is None: labels_ = np.zeros(N, dtype="int") else: labels_ = labels - out = np.empty(N) - grp_sizes = np.ones(N) - # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -926,8 +926,8 @@ def rank_1d( next_val_diff = True if (next_val_diff - or mask[_as[i]] ^ mask[_as[i+1]] - or (check_labels and labels_[_as[i]] != labels_[_as[i+1]]) + or (mask[_as[i]] ^ mask[_as[i+1]]) + or (check_labels and (labels_[_as[i]] != labels_[_as[i+1]])) ): # if keep_na, check for missing values and assign back # to the result where appropriate @@ -959,7 +959,7 @@ def rank_1d( # reset the dups and sum_ranks, knowing that a new value is # coming up. 
the conditional also needs to handle nan equality # and the end of iteration - if next_val_diff or mask[_as[i]] ^ mask[_as[i+1]]: + if next_val_diff or (mask[_as[i]] ^ mask[_as[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 grp_tie_count += 1 @@ -970,7 +970,7 @@ def rank_1d( # decrement that from their position. fill in the size of each # group encountered (used by pct calculations later). also be # sure to reset any of the items helping to calculate dups - if at_end or (check_labels and labels_[_as[i]] != labels_[_as[i+1]]): + if at_end or (check_labels and (labels_[_as[i]] != labels_[_as[i+1]])): if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[_as[j]] = (i - grp_start + 1 - From 5cd81d253768a034873e9faaaed10e969569c215 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 27 Dec 2020 23:09:12 -0500 Subject: [PATCH 07/15] Fix precommit --- asv_bench/benchmarks/frame_methods.py | 6 +----- pandas/_libs/algos.pyx | 6 ++---- pandas/_libs/groupby.pyx | 7 ++----- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 98c884f25d30a..7386b0b903afd 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -604,11 +604,7 @@ class Rank: ] def setup(self, dtype): - self.df = DataFrame( - np.random.randn(10000, 10), - columns=range(10), - dtype=dtype - ) + self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype) def time_rank(self, dtype): self.df.rank() diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d5a6b69cdd643..53deeff76a45a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -973,12 +973,10 @@ def rank_1d( if at_end or (check_labels and (labels_[_as[i]] != labels_[_as[i+1]])): if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (i - grp_start + 1 - - grp_na_count) + grp_sizes[_as[j]] = (i - grp_start + 1 - 
grp_na_count) else: for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (grp_tie_count - - (grp_na_count > 0)) + grp_sizes[_as[j]] = (grp_tie_count - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 grp_tie_count = 0 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 135632c1d4c0e..028b77702f154 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -29,11 +29,7 @@ cnp.import_array() from pandas._libs.algos cimport swap from pandas._libs.util cimport get_nat, numeric -from pandas._libs.algos import ( - groupsort_indexer, - take_2d_axis1_float64_float64, - rank_1d, -) +from pandas._libs.algos import groupsort_indexer, rank_1d, take_2d_axis1_float64_float64 from pandas._libs.missing cimport checknull @@ -1121,6 +1117,7 @@ def group_rank(float64_t[:, :] out, for i in range(len(result)): out[i, 0] = result[i] + # ---------------------------------------------------------------------- # group_min, group_max # ---------------------------------------------------------------------- From 2a53d7c8668a3abbb36e80634ec475785ba39e97 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 28 Dec 2020 01:42:44 -0500 Subject: [PATCH 08/15] Fix dtype --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 53deeff76a45a..4f9d844fc052d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -845,7 +845,7 @@ def rank_1d( check_labels = labels is not None if labels is None: - labels_ = np.zeros(N, dtype="int") + labels_ = np.zeros(N, dtype=np.int64) else: labels_ = labels From 6119f4d8919c99c04e64a1f1dfad4b9ebc9bfeab Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 28 Dec 2020 22:32:46 -0500 Subject: [PATCH 09/15] Add gil block, always pass labels --- pandas/_libs/algos.pyx | 286 ++++++++++++++++++++++++------------- pandas/core/algorithms.py | 1 + pandas/tests/test_algos.py | 2 +- 3 files changed, 192 insertions(+), 97 
deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4f9d844fc052d..fa10c45dcfa29 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -328,8 +328,11 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor + const int64_t[:] labels_n, labels_nobs N, K = (mat).shape + # For compatibility when calling rank_1d + labels_n = np.zeros(N, dtype=np.int64) result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) @@ -337,7 +340,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr ranked_mat = np.empty((N, K), dtype=np.float64) for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i]) + ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) for xi in range(K): for yi in range(xi + 1): @@ -363,8 +366,9 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr j += 1 if not all_ranks: - maskedx = rank_1d(maskedx) - maskedy = rank_1d(maskedy) + labels_nobs = np.zeros(nobs, dtype=np.int64) + maskedx = rank_1d(maskedx, labels=labels_nobs) + maskedy = rank_1d(maskedy, labels=labels_nobs) mean = (nobs + 1) / 2. @@ -793,11 +797,11 @@ ctypedef fused rank_t: @cython.boundscheck(False) def rank_1d( ndarray[rank_t, ndim=1] in_arr, + const int64_t[:] labels, ties_method="average", bint ascending=True, - na_option="keep", bint pct=False, - labels=None, + na_option="keep", ): """ Fast NaN-friendly version of ``scipy.stats.rankdata``. @@ -805,6 +809,9 @@ def rank_1d( Parameters ---------- in_arr : array of rank_t values to be ranked + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values`. 
If not called + from a groupby operation, will be an array of 0's ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' * average: average rank of group @@ -815,14 +822,12 @@ def rank_1d( ascending : boolean, default True False for ranks by high (1) to low (N) na_option : {'keep', 'top', 'bottom'}, default 'keep' + pct : boolean, default False + Compute percentage rank of data within each group na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending - pct : boolean, default False - Compute percentage rank of data within each group - labels : optional array containing group labels (used only when called - from group_rank()) """ cdef: TiebreakEnumType tiebreak @@ -834,7 +839,6 @@ def rank_1d( ndarray[uint8_t, ndim=1] mask bint keep_na, at_end, next_val_diff, check_labels rank_t nan_fill_val - int64_t[:] labels_ tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -842,12 +846,9 @@ def rank_1d( N = len(in_arr) out = np.empty(N) grp_sizes = np.ones(N) - - check_labels = labels is not None - if labels is None: - labels_ = np.zeros(N, dtype=np.int64) - else: - labels_ = labels + # If all 0 labels, can short-circuit later label + # comparisons + check_labels = np.count_nonzero(labels) != 0 # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data @@ -874,7 +875,7 @@ def rank_1d( nan_fill_val = np.iinfo(np.uint64).max else: nan_fill_val = np.inf - order = (masked_vals, mask, labels_) + order = (masked_vals, mask, labels) else: if rank_t is object: nan_fill_val = NegInfinity() @@ -885,7 +886,7 @@ def rank_1d( else: nan_fill_val = -np.inf - order = (masked_vals, ~mask, labels_) + order = (masked_vals, ~mask, labels) np.putmask(masked_vals, mask, nan_fill_val) @@ -903,85 +904,170 @@ def rank_1d( # that we sorted previously, which gives us the location of # that sorted 
value for retrieval back from the original # values / masked_vals arrays - for i in range(N): - at_end = i == N - 1 - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - # Update out only when there is a transition of values or labels. - # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if not at_end: - if rank_t is object: - next_val_diff = are_diff(masked_vals[_as[i]], masked_vals[_as[i+1]]) + # TODO: de-duplicate once cython supports conditional nogil + if rank_t is object: + for i in range(N): + at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change + # Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + # Update out only when there is a transition of values or labels. 
+ # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if not at_end: + if rank_t is object: + next_val_diff = are_diff(masked_vals[_as[i]], masked_vals[_as[i+1]]) + else: + next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] else: - next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] - else: - next_val_diff = True - - if (next_val_diff - or (mask[_as[i]] ^ mask[_as[i+1]]) - or (check_labels and (labels_[_as[i]] != labels_[_as[i+1]])) - ): - # if keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[_as[i]]: - for j in range(i - dups + 1, i + 1): - out[_as[j]] = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j]] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j]] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j]] = j + 1 - grp_start + next_val_diff = True + + if (next_val_diff + or (mask[_as[i]] ^ mask[_as[i+1]]) + or (check_labels and (labels[_as[i]] != labels[_as[i+1]])) + ): + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and mask[_as[i]]: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = i - grp_start + 1 + elif tiebreak == 
TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j]] = j + 1 - grp_start + else: + out[_as[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = grp_vals_seen + + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. the conditional also needs to handle nan equality + # and the end of iteration + if next_val_diff or (mask[_as[i]] ^ mask[_as[i+1]]): + dups = sum_ranks = 0 + grp_vals_seen += 1 + grp_tie_count += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. fill in the size of each + # group encountered (used by pct calculations later). also be + # sure to reset any of the items helping to calculate dups + if at_end or (check_labels and (labels[_as[i]] != labels[_as[i+1]])): + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j]] = (i - grp_start + 1 - grp_na_count) else: - out[_as[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j]] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if next_val_diff or (mask[_as[i]] ^ mask[_as[i+1]]): - dups = sum_ranks = 0 - grp_vals_seen += 1 - grp_tie_count += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. 
If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be - # sure to reset any of the items helping to calculate dups - if at_end or (check_labels and (labels_[_as[i]] != labels_[_as[i+1]])): - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (i - grp_start + 1 - grp_na_count) + for j in range(grp_start, i + 1): + grp_sizes[_as[j]] = (grp_tie_count - (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 + else: + with nogil: + for i in range(N): + at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change + # Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + # Update out only when there is a transition of values or labels. 
+ # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if not at_end: + if rank_t is object: + next_val_diff = are_diff(masked_vals[_as[i]], + masked_vals[_as[i+1]]) + else: + next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (grp_tie_count - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 + next_val_diff = True + + if (next_val_diff + or (mask[_as[i]] ^ mask[_as[i+1]]) + or (check_labels and (labels[_as[i]] != labels[_as[i+1]])) + ): + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and mask[_as[i]]: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j]] = j + 1 - grp_start + else: + out[_as[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j]] = grp_vals_seen + + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. 
the conditional also needs to handle nan equality + # and the end of iteration + if next_val_diff or (mask[_as[i]] ^ mask[_as[i+1]]): + dups = sum_ranks = 0 + grp_vals_seen += 1 + grp_tie_count += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. fill in the size of each + # group encountered (used by pct calculations later). also be + # sure to reset any of the items helping to calculate dups + if at_end or (check_labels and + (labels[_as[i]] != labels[_as[i+1]])): + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j]] = (i - grp_start + 1 - grp_na_count) + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j]] = (grp_tie_count - (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 if pct: for i in range(N): @@ -1018,6 +1104,7 @@ def rank_2d( bint keep_na = False float64_t count = 0.0 bint condition, skip_condition + const int64_t[:] labels tiebreak = tiebreakers[ties_method] @@ -1060,6 +1147,8 @@ def rank_2d( n, k = (values).shape ranks = np.empty((n, k), dtype='f8') + # For compatibility when calling rank_1d + labels = np.zeros(k, dtype=np.int64) if rank_t is object: try: @@ -1067,8 +1156,13 @@ def rank_2d( except TypeError: values = in_arr for i in range(len(values)): - ranks[i] = rank_1d(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) + ranks[i] = rank_1d( + in_arr[i], + labels=labels, + ties_method=ties_method, + ascending=ascending, + pct=pct + ) if axis == 0: return ranks.T else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2098392cf70a9..138fee104f396 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -992,6 +992,7 @@ def rank( values = _get_values_for_rank(values) ranks = algos.rank_1d( values, + 
labels=np.zeros(len(values), dtype=np.int64), ties_method=method, ascending=ascending, na_option=na_option, diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8fcc241348f27..3e26ac3d83faa 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1733,7 +1733,7 @@ def test_scipy_compat(self): def _check(arr): mask = ~np.isfinite(arr) arr = arr.copy() - result = libalgos.rank_1d(arr) + result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.int64)) arr[mask] = np.inf exp = rankdata(arr) exp[mask] = np.nan From 0ab6b0f58f296607c29d0a7fb7149987ab808c9b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 22:04:58 -0500 Subject: [PATCH 10/15] Address comments --- pandas/_libs/algos.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index bb2e9bcddf6a2..299492feead64 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -844,6 +844,7 @@ def rank_1d( keep_na = na_option == 'keep' N = len(in_arr) + assert(len(labels) == N) out = np.empty(N) grp_sizes = np.ones(N) # If all 0 labels, can short-circuit later label @@ -853,9 +854,10 @@ def rank_1d( # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array - masked_vals = np.array(in_arr, copy=True) - if rank_t is object and masked_vals.dtype != np.object_: - masked_vals = masked_vals.astype('O') + if rank_t is object and in_arr.dtype != np.object_: + masked_vals = in_arr.astype('O') + else: + masked_vals = in_arr.copy() if rank_t is object: mask = missing.isnaobj(masked_vals) From 68db11ff6eed11b509a0f82f96abaf0f39cb2b6f Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 31 Dec 2020 11:36:00 -0500 Subject: [PATCH 11/15] Try a cast first --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 
299492feead64..dfc1ab1bdf08f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -844,7 +844,7 @@ def rank_1d( keep_na = na_option == 'keep' N = len(in_arr) - assert(len(labels) == N) + assert(<Py_ssize_t>len(labels) == N) out = np.empty(N) grp_sizes = np.ones(N) # If all 0 labels, can short-circuit later label From b84b44f3edf5d5a5186352d9c3af979968e6448a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 31 Dec 2020 12:52:30 -0500 Subject: [PATCH 12/15] Address comments --- pandas/_libs/algos.pyx | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index dfc1ab1bdf08f..7b47a6d742855 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -35,6 +35,7 @@ from numpy cimport ( cnp.import_array() +from numpy.math cimport NAN cimport pandas._libs.util as util from pandas._libs.khash cimport ( @@ -840,11 +841,18 @@ def rank_1d( bint keep_na, at_end, next_val_diff, check_labels rank_t nan_fill_val + # print(NAN) + # print(NaN) + # print("NAN eq?") + # print(NAN == np.inf) + # print(NAN == -np.inf) + tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' N = len(in_arr) - assert(<Py_ssize_t>len(labels) == N) + # TODO Cython 3.0: cast won't be necessary (#2992) + assert <Py_ssize_t>len(labels) == N out = np.empty(N) grp_sizes = np.ones(N) # If all 0 labels, can short-circuit later label @@ -923,10 +931,7 @@ def rank_1d( # based on the starting index of the current group (grp_start) # and the current index if not at_end: - if rank_t is object: - next_val_diff = are_diff(masked_vals[_as[i]], masked_vals[_as[i+1]]) - else: - next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] + next_val_diff = are_diff(masked_vals[_as[i]], masked_vals[_as[i+1]]) else: next_val_diff = True @@ -1004,11 +1009,7 @@ def rank_1d( # based on the starting index of the current group (grp_start) # and the current index if not at_end: - if rank_t is object: - next_val_diff = 
are_diff(masked_vals[_as[i]], - masked_vals[_as[i+1]]) - else: - next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] + next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] else: next_val_diff = True @@ -1073,11 +1074,7 @@ def rank_1d( if pct: for i in range(N): - # We don't include NaN values in percentage - # rankings, so we assign them percentages of NaN. - if out[i] != out[i] or out[i] == NaN: - out[i] = NaN - elif grp_sizes[i] != 0: + if grp_sizes[i] != 0: out[i] = out[i] / grp_sizes[i] return out From a9c7f4f591e0d2fe8ef4c763c6adf1fdffe08c8f Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 31 Dec 2020 12:58:21 -0500 Subject: [PATCH 13/15] Clean --- pandas/_libs/algos.pyx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7b47a6d742855..5bfb51b726325 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -35,8 +35,6 @@ from numpy cimport ( cnp.import_array() -from numpy.math cimport NAN - cimport pandas._libs.util as util from pandas._libs.khash cimport ( kh_destroy_int64, @@ -841,12 +839,6 @@ def rank_1d( bint keep_na, at_end, next_val_diff, check_labels rank_t nan_fill_val - # print(NAN) - # print(NaN) - # print("NAN eq?") - # print(NAN == np.inf) - # print(NAN == -np.inf) - tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' From 54e786a7c8345d73f092355a322bf06f8aec7647 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 31 Dec 2020 13:23:02 -0500 Subject: [PATCH 14/15] Use any instead of count_nonzero --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5bfb51b726325..54526cc0c8d2e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -849,7 +849,7 @@ def rank_1d( grp_sizes = np.ones(N) # If all 0 labels, can short-circuit later label # comparisons - check_labels = np.count_nonzero(labels) != 0 + check_labels = np.any(labels) # Copy 
values into new array in order to fill missing data # with mask, without obfuscating location of missing data From f81677b4a97756b0addd76a5d240ebbb0a74cc01 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 31 Dec 2020 15:27:46 -0500 Subject: [PATCH 15/15] Use clearer naming --- pandas/_libs/algos.pyx | 111 ++++++++++++++++++++++----------------- pandas/_libs/groupby.pyx | 2 +- 2 files changed, 65 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 54526cc0c8d2e..3aa4738b36dc8 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -795,7 +795,7 @@ ctypedef fused rank_t: @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - ndarray[rank_t, ndim=1] in_arr, + ndarray[rank_t, ndim=1] values, const int64_t[:] labels, ties_method="average", bint ascending=True, @@ -807,7 +807,7 @@ def rank_1d( Parameters ---------- - in_arr : array of rank_t values to be ranked + values : array of rank_t values to be ranked labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. 
If not called from a groupby operation, will be an array of 0's @@ -832,7 +832,7 @@ def rank_1d( TiebreakEnumType tiebreak Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t, ndim=1] _as + ndarray[int64_t, ndim=1] lexsort_indexer ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask @@ -842,7 +842,7 @@ def rank_1d( tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' - N = len(in_arr) + N = len(values) # TODO Cython 3.0: cast won't be necessary (#2992) assert len(labels) == N out = np.empty(N) @@ -854,10 +854,10 @@ def rank_1d( # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array - if rank_t is object and in_arr.dtype != np.object_: - masked_vals = in_arr.astype('O') + if rank_t is object and values.dtype != np.object_: + masked_vals = values.astype('O') else: - masked_vals = in_arr.copy() + masked_vals = values.copy() if rank_t is object: mask = missing.isnaobj(masked_vals) @@ -896,14 +896,14 @@ def rank_1d( # each label corresponds to a different group value, # the mask helps you differentiate missing values before # performing sort on the actual values - _as = np.lexsort(order).astype(np.int64, copy=False) + lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False) if not ascending: - _as = _as[::-1] + lexsort_indexer = lexsort_indexer[::-1] # Loop over the length of the value array - # each incremental i value can be looked up in the _as array - # that we sorted previously, which gives us the location of + # each incremental i value can be looked up in the lexsort_indexer + # array that we sorted previously, which gives us the location of # that sorted value for retrieval back from the original # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil @@ -923,45 +923,49 @@ def rank_1d( # based on 
the starting index of the current group (grp_start) # and the current index if not at_end: - next_val_diff = are_diff(masked_vals[_as[i]], masked_vals[_as[i+1]]) + next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) else: next_val_diff = True if (next_val_diff - or (mask[_as[i]] ^ mask[_as[i+1]]) - or (check_labels and (labels[_as[i]] != labels[_as[i+1]])) + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) + or (check_labels + and (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) ): # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[_as[i]]: + if keep_na and mask[lexsort_indexer[i]]: for j in range(i - dups + 1, i + 1): - out[_as[j]] = NaN + out[lexsort_indexer[j]] = NaN grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - out[_as[j]] = sum_ranks / dups + out[lexsort_indexer[j]] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): - out[_as[j]] = i - grp_start - dups + 2 + out[lexsort_indexer[j]] = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): - out[_as[j]] = i - grp_start + 1 + out[lexsort_indexer[j]] = i - grp_start + 1 elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: - out[_as[j]] = j + 1 - grp_start + out[lexsort_indexer[j]] = j + 1 - grp_start else: - out[_as[j]] = 2 * i - j - dups + 2 - grp_start + out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[_as[j]] = grp_vals_seen + out[lexsort_indexer[j]] = grp_vals_seen # look forward to the next value (using the sorting in _as) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is # coming up. 
the conditional also needs to handle nan equality # and the end of iteration - if next_val_diff or (mask[_as[i]] ^ mask[_as[i+1]]): + if next_val_diff or (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 grp_tie_count += 1 @@ -972,13 +976,18 @@ def rank_1d( # decrement that from their position. fill in the size of each # group encountered (used by pct calculations later). also be # sure to reset any of the items helping to calculate dups - if at_end or (check_labels and (labels[_as[i]] != labels[_as[i+1]])): + if (at_end or + (check_labels + and (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]]))): if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (i - grp_start + 1 - grp_na_count) + grp_sizes[lexsort_indexer[j]] = \ + (i - grp_start + 1 - grp_na_count) else: for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (grp_tie_count - (grp_na_count > 0)) + grp_sizes[lexsort_indexer[j]] = \ + (grp_tie_count - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 grp_tie_count = 0 @@ -1001,45 +1010,50 @@ def rank_1d( # based on the starting index of the current group (grp_start) # and the current index if not at_end: - next_val_diff = masked_vals[_as[i]] != masked_vals[_as[i+1]] + next_val_diff = (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) else: next_val_diff = True if (next_val_diff - or (mask[_as[i]] ^ mask[_as[i+1]]) - or (check_labels and (labels[_as[i]] != labels[_as[i+1]])) + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) + or (check_labels + and (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) ): # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[_as[i]]: + if keep_na and mask[lexsort_indexer[i]]: for j in range(i - dups + 1, i + 1): - out[_as[j]] = NaN + out[lexsort_indexer[j]] = NaN grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: for j in 
range(i - dups + 1, i + 1): - out[_as[j]] = sum_ranks / dups + out[lexsort_indexer[j]] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): - out[_as[j]] = i - grp_start - dups + 2 + out[lexsort_indexer[j]] = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): - out[_as[j]] = i - grp_start + 1 + out[lexsort_indexer[j]] = i - grp_start + 1 elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): if ascending: - out[_as[j]] = j + 1 - grp_start + out[lexsort_indexer[j]] = j + 1 - grp_start else: - out[_as[j]] = 2 * i - j - dups + 2 - grp_start + out[lexsort_indexer[j]] = \ + (2 * i - j - dups + 2 - grp_start) elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[_as[j]] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if next_val_diff or (mask[_as[i]] ^ mask[_as[i+1]]): + out[lexsort_indexer[j]] = grp_vals_seen + + # look forward to the next value (using the sorting in + # lexsort_indexer) if the value does not equal the current + # value then we need to reset the dups and sum_ranks, + # knowing that a new value is coming up. the conditional + # also needs to handle nan equality and the end of iteration + if next_val_diff or (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 grp_tie_count += 1 @@ -1051,13 +1065,16 @@ def rank_1d( # group encountered (used by pct calculations later). 
also be # sure to reset any of the items helping to calculate dups if at_end or (check_labels and - (labels[_as[i]] != labels[_as[i+1]])): + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])): if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (i - grp_start + 1 - grp_na_count) + grp_sizes[lexsort_indexer[j]] = \ + (i - grp_start + 1 - grp_na_count) else: for j in range(grp_start, i + 1): - grp_sizes[_as[j]] = (grp_tie_count - (grp_na_count > 0)) + grp_sizes[lexsort_indexer[j]] = \ + (grp_tie_count - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 grp_tie_count = 0 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 028b77702f154..ffb75401013dc 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1107,7 +1107,7 @@ def group_rank(float64_t[:, :] out, ndarray[float64_t, ndim=1] result result = rank_1d( - in_arr=values[:, 0], + values=values[:, 0], labels=labels, ties_method=ties_method, ascending=ascending,