From c7a91ac3fb89638285e2b524c849197c37375f6b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 16:00:40 -0400 Subject: [PATCH 01/25] REF: split out sorted_rank algo --- pandas/_libs/algos.pyx | 335 +++++++++++++++++++++++++---------------- 1 file changed, 203 insertions(+), 132 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index f2efeedb80d4d..362df3bf7e710 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -950,16 +950,15 @@ def rank_1d( """ cdef: TiebreakEnumType tiebreak - Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0 - ndarray[int64_t, ndim=1] grp_sizes - ndarray[intp_t, ndim=1] lexsort_indexer - ndarray[float64_t, ndim=1] out - ndarray[rank_t, ndim=1] masked_vals - ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels, group_changed + Py_ssize_t N + int64_t[::1] grp_sizes + intp_t[:] lexsort_indexer + float64_t[::1] out + ndarray [rank_t, ndim=1] masked_vals + rank_t[:] masked_vals_memview + uint8_t[:] mask + bint keep_na, check_labels rank_t nan_fill_val - int64_t grp_size tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -978,6 +977,9 @@ def rank_1d( # comparisons check_labels = np.any(labels) + # For cases where a mask is not possible, we can avoid mask checks + check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -1021,9 +1023,11 @@ def rank_1d( else: nan_fill_val = -np.inf - order = (masked_vals, ~mask, labels) + order = (masked_vals, ~(np.array(mask, copy=False)), labels) np.putmask(masked_vals, mask, nan_fill_val) + # putmask doesn't accept a memoryview, so we assign as a separate step + masked_vals_memview = masked_vals # lexsort using labels, then mask, then actual values # each label corresponds to a different group value, @@ -1034,6 +1038,77 @@ def rank_1d( if not ascending: lexsort_indexer = lexsort_indexer[::-1] + with nogil: + rank_sorted_1d( + out, + grp_sizes, + labels, + lexsort_indexer, + masked_vals_memview, + mask, + tiebreak, + check_mask, + check_labels, + keep_na, + N, + ) + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + + return np.array(out) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void rank_sorted_1d( + float64_t[::1] out, + int64_t[::1] grp_sizes, + const intp_t[:] labels, + const intp_t[:] sort_indexer, + # Can make const with cython3 (https://github.com/cython/cython/issues/3222) + rank_t[:] masked_vals, + const uint8_t[:] mask, + TiebreakEnumType tiebreak, + bint check_mask, + bint check_labels, + bint keep_na, + Py_ssize_t N, +) nogil: + """ + See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should + be handled in the caller. Note that `out` and `grp_sizes` are modified inplace. + + Parameters + ---------- + out : float64_t[::1] + Array to store computed ranks + grp_sizes : int64_t[::1] + Array to store group counts. 
+ labels : See rank_1d.__doc__ + sort_indexer : intp_t[:] + masked_vals : rank_t[:] + mask : uint8_t[:] + tiebreak : TiebreakEnumType + See rank_1d.__doc__ for the different modes + check_mask : bint + If False, assumes the mask is all False to skip mask indexing + check_labels : bint + If False, assumes all labels are the same to skip group handling logic + keep_na : bint + Whether or not to keep nulls + N : Py_ssize_t + The number of elements to rank. Note: it is not always true that + N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) + """ + + cdef: + Py_ssize_t i, j, dups=0, sum_ranks=0, + Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 + bint at_end, next_val_diff, group_changed + int64_t grp_size + # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer # array that we sorted previously, which gives us the location of @@ -1041,105 +1116,7 @@ def rank_1d( # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil if rank_t is object: - for i in range(N): - at_end = i == N - 1 - - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change. Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) - - # We'll need this check later anyway to determine group size, so just - # compute it here since shortcircuiting won't help - group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - - # Update out only when there is a transition of values or labels. - # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if (next_val_diff or group_changed - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - - # If keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: - grp_na_count = dups - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 - - # With n as the previous rank in the group and m as the number - # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, - # then rankings should be n + 1, n + 2 ... n + m - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = j + 1 - grp_start - - # If TIEBREAK_FIRST and descending, the ranking should be - # n + m, n + (m - 1) ... n + 1. This is equivalent to - # (i - dups + 1) + (i - j + 1) - grp_start - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen - - # Look forward to the next value (using the sorting in - # lexsort_indexer). 
If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. The conditional also needs - # to handle nan equality and the end of iteration. If group - # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or - (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. Fill in the size of each - # group encountered (used by pct calculations later). Also be - # sure to reset any of the items helping to calculate dups - if group_changed: - - # If not dense tiebreak, group size used to compute - # percentile will be # of non-null elements in group - if tiebreak != TIEBREAK_DENSE: - grp_size = i - grp_start + 1 - grp_na_count - - # Otherwise, it will be the number of distinct values - # in the group, subtracting 1 if NaNs are present - # since that is a distinct value we shouldn't count - else: - grp_size = grp_vals_seen - (grp_na_count > 0) - - for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = grp_size - - dups = sum_ranks = 0 - grp_na_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - else: - with nogil: + with gil: for i in range(N): at_end = i == N - 1 @@ -1149,55 +1126,56 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], + masked_vals[sort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if (next_val_diff or group_changed - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + if (next_val_diff or group_changed or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): # If keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: + if keep_na and check_mask and mask[sort_indexer[i]]: grp_na_count = dups for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN + out[sort_indexer[j]] = NaN elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups + out[sort_indexer[j]] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 + out[sort_indexer[j]] = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 + out[sort_indexer[j]] = i - grp_start + 1 # With n as the previous rank in the group and m as the number # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, # then rankings should be n + 1, n + 2 ... 
n + m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = j + 1 - grp_start + out[sort_indexer[j]] = j + 1 - grp_start # If TIEBREAK_FIRST and descending, the ranking should be # n + m, n + (m - 1) ... n + 1. This is equivalent to # (i - dups + 1) + (i - j + 1) - grp_start elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen + out[sort_indexer[j]] = grp_vals_seen # Look forward to the next value (using the sorting in # lexsort_indexer). If the value does not equal the current @@ -1205,9 +1183,9 @@ def rank_1d( # that a new value is coming up. The conditional also needs # to handle nan equality and the end of iteration. If group # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or - (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]])): + if not group_changed and (next_val_diff or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1231,19 +1209,112 @@ def rank_1d( grp_size = grp_vals_seen - (grp_na_count > 0) for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = grp_size + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 grp_start = i + 1 grp_vals_seen = 1 - - if pct: + else: for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] + at_end = i == N - 1 + + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change. Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + next_val_diff = at_end or (masked_vals[sort_indexer[i]] + != masked_vals[sort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) + + # Update out only when there is a transition of values or labels. + # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if (next_val_diff or group_changed + or (check_mask and + (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))): + + # If keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and check_mask and mask[sort_indexer[i]]: + grp_na_count = dups + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... 
n + m + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = grp_vals_seen + + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration. If group + # changes we do not record seeing a new value in the group + if not group_changed and (next_val_diff + or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): + dups = sum_ranks = 0 + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). Also be + # sure to reset any of the items helping to calculate dups + if group_changed: - return out + # If not dense tiebreak, group size used to compute + # percentile will be # of non-null elements in group + if tiebreak != TIEBREAK_DENSE: + grp_size = i - grp_start + 1 - grp_na_count + + # Otherwise, it will be the number of distinct values + # in the group, subtracting 1 if NaNs are present + # since that is a distinct value we shouldn't count + else: + grp_size = grp_vals_seen - (grp_na_count > 0) + + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size + + dups = sum_ranks = 0 + grp_na_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 def rank_2d( From 4b0641ecfb96f49a80a221915e0a013b0d783b08 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 16:26:41 -0400 Subject: [PATCH 02/25] Fixup docstring --- pandas/_libs/algos.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 362df3bf7e710..9ab12607e7789 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -954,10 +954,10 @@ def rank_1d( int64_t[::1] grp_sizes intp_t[:] lexsort_indexer float64_t[::1] out - ndarray [rank_t, ndim=1] masked_vals + ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview uint8_t[:] mask - bint keep_na, check_labels + bint keep_na, check_labels, check_mask rank_t nan_fill_val tiebreak = tiebreakers[ties_method] @@ -1088,8 +1088,11 @@ cdef void rank_sorted_1d( Array to store group counts. 
labels : See rank_1d.__doc__ sort_indexer : intp_t[:] + Array of indices which sorts masked_vals masked_vals : rank_t[:] + The values input to rank_1d, with missing values replaced by fill values mask : uint8_t[:] + Array where entries are True if the value is missing, False otherwise tiebreak : TiebreakEnumType See rank_1d.__doc__ for the different modes check_mask : bint From b6dd4a6010d75af32725392c1e49e7bea352c0b4 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 18:31:35 -0400 Subject: [PATCH 03/25] WIP --- pandas/_libs/algos.pyx | 172 +++++++++++--------------- pandas/_libs/algos_take_helper.pxi.in | 30 ----- 2 files changed, 73 insertions(+), 129 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 9ab12607e7789..fcdc0d1ccaf27 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1038,30 +1038,30 @@ def rank_1d( if not ascending: lexsort_indexer = lexsort_indexer[::-1] - with nogil: - rank_sorted_1d( - out, - grp_sizes, - labels, - lexsort_indexer, - masked_vals_memview, - mask, - tiebreak, - check_mask, - check_labels, - keep_na, - N, - ) - if pct: - for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] + # with nogil: + rank_sorted_1d( + out, + grp_sizes, + labels, + lexsort_indexer, + masked_vals_memview, + mask, + tiebreak, + check_mask, + check_labels, + keep_na, + N, + ) + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] return np.array(out) -@cython.wraparound(False) -@cython.boundscheck(False) +# @cython.wraparound(False) +# @cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, @@ -1075,7 +1075,7 @@ cdef void rank_sorted_1d( bint check_labels, bint keep_na, Py_ssize_t N, -) nogil: +): """ See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should be handled in the caller. Note that `out` and `grp_sizes` are modified inplace. @@ -1119,7 +1119,7 @@ cdef void rank_sorted_1d( # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil if rank_t is object: - with gil: + # with gil: for i in range(N): at_end = i == N - 1 @@ -1220,6 +1220,7 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 else: for i in range(N): + print(i) at_end = i == N - 1 # dups and sum_ranks will be incremented each loop where @@ -1227,15 +1228,18 @@ cdef void rank_sorted_1d( # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 - + print(sort_indexer[i]) + print(sort_indexer[i+1]) next_val_diff = at_end or (masked_vals[sort_indexer[i]] != masked_vals[sort_indexer[i+1]]) + print("here") # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and (labels[sort_indexer[i]] != labels[sort_indexer[i+1]])) + print("here") # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( @@ -1333,17 +1337,16 @@ def rank_2d( Fast NaN-friendly version of ``scipy.stats.rankdata``. 
""" cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - Py_ssize_t infs - ndarray[float64_t, ndim=2] ranks + Py_ssize_t k, n, col + float64_t[::1, :] out # Column-major so columns are contiguous + int64_t[::1, :] grp_sizes + const intp_t[:] labels ndarray[rank_t, ndim=2] values - ndarray[intp_t, ndim=2] argsort_indexer - ndarray[uint8_t, ndim=2] mask - rank_t val, nan_value - float64_t count, sum_ranks = 0.0 - int tiebreak = 0 - int64_t idx - bint check_mask, condition, keep_na + rank_t[:, :] masked_vals_memview + intp_t[:, :] argsort_indexer + uint8_t[:, :] mask + TiebreakEnumType tiebreak + bint check_mask, keep_na tiebreak = tiebreakers[ties_method] @@ -1396,85 +1399,56 @@ def rank_2d( mask = np.zeros_like(values, dtype=bool) n, k = (values).shape - ranks = np.empty((n, k), dtype='f8') + out = np.empty((n, k), dtype='f8', order='F') + grp_sizes = np.ones((n, k), dtype='i8', order='F') + labels = np.ones(n, dtype=np.intp) if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here - argsort_indexer = values.argsort(axis=1, kind='mergesort') + argsort_indexer = values.argsort(axis=1, kind='mergesort').astype( + np.intp, copy=False + ) if not ascending: tiebreak = TIEBREAK_FIRST_DESCENDING else: - argsort_indexer = values.argsort(1) + argsort_indexer = values.argsort(1).astype(np.intp, copy=False) if not ascending: argsort_indexer = argsort_indexer[:, ::-1] - values = _take_2d(values, argsort_indexer) - - for i in range(n): - dups = sum_ranks = infs = 0 - - total_tie_count = 0 - count = 0.0 - for j in range(k): - val = values[i, j] - idx = argsort_indexer[i, j] - if keep_na and check_mask and mask[i, idx]: - ranks[i, idx] = NaN - infs += 1 - continue - - count += 1.0 - - sum_ranks += (j - infs) + 1 - dups += 1 + masked_vals_memview = values + print(np.array(argsort_indexer)) + + print(k) + print(n) + print(values) + for col in range(k): + print("col" + str(col)) + # print(np.array(masked_vals_memview[:, col])) + + print(np.array(argsort_indexer[:, col])) + print(np.array(masked_vals_memview[:, col])) + # print(np.array(mask[:, col])) + # print(np.array(grp_sizes[:, col])) + # print(np.array(out[:, col])) + rank_sorted_1d( + out[:, col], + grp_sizes[:, col], + labels, + argsort_indexer[:, col], + masked_vals_memview[:, col], + mask[:, col], + tiebreak, + check_mask, + False, + keep_na, + n, + ) - if rank_t is object: - condition = ( - j == k - 1 or - are_diff(values[i, j + 1], val) or - (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) - ) - else: - condition = ( - j == k - 1 or - values[i, j + 1] != val or - (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) - ) - - if condition: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for non-numeric data') - else: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsort_indexer[i, z]] = total_tie_count - sum_ranks = dups 
= 0 - if pct: - if tiebreak == TIEBREAK_DENSE: - ranks[i, :] /= total_tie_count - else: - ranks[i, :] /= count if axis == 0: - return ranks.T + return np.array(out.T) else: - return ranks + return np.array(out) ctypedef fused diff_t: diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 11679fc432edc..1ad54216a1532 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -244,33 +244,3 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} {{endfor}} - -# ---------------------------------------------------------------------- -# take_2d internal function -# ---------------------------------------------------------------------- - -ctypedef fused take_t: - float64_t - uint64_t - int64_t - object - - -cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[intp_t, ndim=2, cast=True] indexer = idx - ndarray[take_t, ndim=2] result - - N, K = (values).shape - - if take_t is object: - # evaluated at compile-time - result = values.copy() - else: - result = np.empty_like(values) - - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result From 953b188cb429ea89c43442821e84b1a3d443a158 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 20:36:07 -0400 Subject: [PATCH 04/25] WIP --- pandas/_libs/algos.pyx | 103 ++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index fcdc0d1ccaf27..b886721ceba12 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1050,12 +1050,9 @@ def rank_1d( check_mask, check_labels, keep_na, + pct, N, ) - if pct: - for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] return np.array(out) @@ -1074,6 +1071,7 @@ cdef void rank_sorted_1d( bint check_mask, bint check_labels, bint keep_na, + bint pct, Py_ssize_t N, ): """ @@ -1220,7 +1218,6 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 else: for i in range(N): - print(i) at_end = i == N - 1 # dups and sum_ranks will be incremented each loop where @@ -1228,18 +1225,14 @@ cdef void rank_sorted_1d( # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 - print(sort_indexer[i]) - print(sort_indexer[i+1]) next_val_diff = at_end or (masked_vals[sort_indexer[i]] != masked_vals[sort_indexer[i+1]]) - print("here") # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and (labels[sort_indexer[i]] != labels[sort_indexer[i+1]])) - print("here") # Update out only when there is a transition of values or labels. 
# When a new value or group is encountered, go back #dups steps( @@ -1323,6 +1316,11 @@ cdef void rank_sorted_1d( grp_start = i + 1 grp_vals_seen = 1 + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + def rank_2d( ndarray[rank_t, ndim=2] in_arr, @@ -1355,7 +1353,7 @@ def rank_2d( # For cases where a mask is not possible, we can avoid mask checks check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) - if axis == 0: + if axis == 1: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() @@ -1364,73 +1362,51 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') - if check_mask: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - - # int64 and datetimelike - else: - nan_value = np.iinfo(np.int64).max + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is int64_t and is_datetimelike: + mask = (values == NPY_NAT).astype(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).astype(np.uint8) + else: + mask = np.zeros_like(values, dtype=np.uint8) + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_fill_val = Infinity() + elif rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + elif rank_t is uint64_t: + nan_fill_val = np.iinfo(np.uint64).max else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - - # int64 and datetimelike - else: - nan_value = NPY_NAT + nan_fill_val = np.inf + order = (values, mask) + else: if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is float64_t: - mask = np.isnan(values) - - # int64 and datetimelike + nan_fill_val = NegInfinity() + elif rank_t is int64_t: + nan_fill_val = NPY_NAT + elif rank_t is uint64_t: + nan_fill_val = 0 else: - mask = values == NPY_NAT + nan_fill_val = -np.inf - np.putmask(values, mask, nan_value) - else: - mask = np.zeros_like(values, dtype=bool) + order = (values, ~np.array(mask)) + + np.putmask(values, mask, nan_fill_val) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') grp_sizes = np.ones((n, k), dtype='i8', order='F') - labels = np.ones(n, dtype=np.intp) - - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - argsort_indexer = values.argsort(axis=1, kind='mergesort').astype( - np.intp, copy=False - ) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - argsort_indexer = values.argsort(1).astype(np.intp, copy=False) + labels = np.zeros(n, dtype=np.intp) + argsort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) if not ascending: - argsort_indexer = argsort_indexer[:, ::-1] + argsort_indexer = argsort_indexer[::-1, :] masked_vals_memview = values - print(np.array(argsort_indexer)) - - print(k) - print(n) - print(values) for col in range(k): - print("col" + str(col)) - # print(np.array(masked_vals_memview[:, col])) - - print(np.array(argsort_indexer[:, col])) - print(np.array(masked_vals_memview[:, col])) - # print(np.array(mask[:, col])) - # print(np.array(grp_sizes[:, col])) - # print(np.array(out[:, col])) rank_sorted_1d( out[:, col], grp_sizes[:, col], @@ -1442,10 +1418,11 @@ def rank_2d( check_mask, False, keep_na, + pct, n, ) - if axis == 0: + if axis == 1: return np.array(out.T) else: return np.array(out) From 254b9974025b7f8ff5851911c3cb078d953995d2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 20:38:30 -0400 Subject: [PATCH 05/25] 
premerge --- pandas/_libs/algos.pyx | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b886721ceba12..61b5a6140e7d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1038,27 +1038,27 @@ def rank_1d( if not ascending: lexsort_indexer = lexsort_indexer[::-1] - # with nogil: - rank_sorted_1d( - out, - grp_sizes, - labels, - lexsort_indexer, - masked_vals_memview, - mask, - tiebreak, - check_mask, - check_labels, - keep_na, - pct, - N, - ) + with nogil: + rank_sorted_1d( + out, + grp_sizes, + labels, + lexsort_indexer, + masked_vals_memview, + mask, + tiebreak, + check_mask, + check_labels, + keep_na, + pct, + N, + ) return np.array(out) -# @cython.wraparound(False) -# @cython.boundscheck(False) +@cython.wraparound(False) +@cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, @@ -1117,7 +1117,7 @@ cdef void rank_sorted_1d( # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil if rank_t is object: - # with gil: + with gil: for i in range(N): at_end = i == N - 1 From 29dc59090e865ad69d7c5c1e647895916b315bdb Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 21:25:24 -0400 Subject: [PATCH 06/25] REF: give ranks same nan filling --- pandas/_libs/algos.pyx | 100 +++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 03f4ce273de6e..4fd515113316c 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -931,6 +931,32 @@ ctypedef fused rank_t: int64_t +cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, ndarray[rank_t, ndim=1] _): + """ + Return the value we'll use to represent missing values when sorting depending + on if we'd like missing values to end up at the top/bottom. (The second parameter + is unused, but needed for fused type specialization) + """ + if rank_nans_highest: + if rank_t is object: + return Infinity() + elif rank_t is int64_t: + return np.iinfo(np.int64).max + elif rank_t is uint64_t: + return np.iinfo(np.uint64).max + else: + return np.inf + else: + if rank_t is object: + return NegInfinity() + elif rank_t is int64_t: + return NPY_NAT + elif rank_t is uint64_t: + return 0 + else: + return -np.inf + + @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( @@ -980,7 +1006,7 @@ def rank_1d( ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview uint8_t[:] mask - bint keep_na, check_labels, check_mask + bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val tiebreak = tiebreakers[ties_method] @@ -1026,26 +1052,11 @@ def rank_1d( # If descending, fill with highest value since descending # will flip the ordering to still end up with lowest rank. 
# Symmetric logic applies to `na_option == 'bottom'` - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_fill_val = Infinity() - elif rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max - else: - nan_fill_val = np.inf + nans_rank_highest = ascending ^ (na_option == 'top') + nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, masked_vals) + if nans_rank_highest: order = (masked_vals, mask, labels) else: - if rank_t is object: - nan_fill_val = NegInfinity() - elif rank_t is int64_t: - nan_fill_val = NPY_NAT - elif rank_t is uint64_t: - nan_fill_val = 0 - else: - nan_fill_val = -np.inf - order = (masked_vals, ~(np.array(mask, copy=False)), labels) np.putmask(masked_vals, mask, nan_fill_val) @@ -1073,12 +1084,9 @@ def rank_1d( check_mask, check_labels, keep_na, + pct, N, ) - if pct: - for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] return np.array(out) @@ -1097,6 +1105,7 @@ cdef void rank_sorted_1d( bint check_mask, bint check_labels, bint keep_na, + bint pct, Py_ssize_t N, ) nogil: """ @@ -1108,7 +1117,7 @@ cdef void rank_sorted_1d( out : float64_t[::1] Array to store computed ranks grp_sizes : int64_t[::1] - Array to store group counts. + Array to store group counts, only used if pct=True labels : See rank_1d.__doc__ sort_indexer : intp_t[:] Array of indices which sorts masked_vals @@ -1118,12 +1127,14 @@ cdef void rank_sorted_1d( Array where entries are True if the value is missing, False otherwise tiebreak : TiebreakEnumType See rank_1d.__doc__ for the different modes - check_mask : bint + check_mask : bool If False, assumes the mask is all False to skip mask indexing - check_labels : bint + check_labels : bool If False, assumes all labels are the same to skip group handling logic - keep_na : bint + keep_na : bool Whether or not to keep nulls + pct : bool + Compute percentage rank of data within each group N : Py_ssize_t The number of elements to rank. 
Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) @@ -1342,6 +1353,11 @@ cdef void rank_sorted_1d( grp_start = i + 1 grp_vals_seen = 1 + if pct: + for i in range(N): + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + def rank_2d( ndarray[rank_t, ndim=2] in_arr, @@ -1360,13 +1376,14 @@ def rank_2d( Py_ssize_t infs ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values + ndarray[rank_t, ndim=1] unused ndarray[intp_t, ndim=2] argsort_indexer ndarray[uint8_t, ndim=2] mask - rank_t val, nan_value + rank_t val, nan_fill_val float64_t count, sum_ranks = 0.0 int tiebreak = 0 int64_t idx - bint check_mask, condition, keep_na + bint check_mask, condition, keep_na, nans_rank_highest tiebreak = tiebreakers[ties_method] @@ -1384,26 +1401,11 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') + nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - - # int64 and datetimelike - else: - nan_value = np.iinfo(np.int64).max - - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - - # int64 and datetimelike - else: - nan_value = NPY_NAT + # For fused type specialization + unused = values[:, 0] + nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) if rank_t is object: mask = missing.isnaobj2d(values) @@ -1414,7 +1416,7 @@ def rank_2d( else: mask = values == NPY_NAT - np.putmask(values, mask, nan_value) + np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=bool) From 974650d76e1eca3d8dcb1a5e646e0b615dfdc6dc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 21:46:07 -0400 Subject: [PATCH 07/25] WIP --- pandas/_libs/algos.pyx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index bf5667b93f455..86690b44b1133 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1382,6 +1382,7 @@ def rank_2d( uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest + rank_t nan_fill_val tiebreak = tiebreakers[ties_method] @@ -1404,6 +1405,15 @@ def rank_2d( # For fused type specialization unused = values[:, 0] nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) + + if rank_t is object: + mask = missing.isnaobj2d(values).astype(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).astype(np.uint8) + + # int64 and datetimelike + else: + mask = (values == NPY_NAT).astype(np.uint8) np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=np.uint8) From b840b74040de4390ccd86cdee7e536b297f6a590 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 21:58:55 -0400 Subject: [PATCH 08/25] Handle empty case early --- pandas/_libs/algos.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4fd515113316c..c55d1e9898b79 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1385,6 +1385,9 @@ def rank_2d( int64_t idx bint check_mask, condition, keep_na, nans_rank_highest + if in_arr.shape[0] == 0 or in_arr.shape[1] == 0: + return np.empty_like(in_arr, dtype="f8") + tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' From f099bb0d0fd18440f1210adf13762f14aa3259e2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 22:02:06 
-0400 Subject: [PATCH 09/25] Handle empty case early --- pandas/_libs/algos.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 86690b44b1133..a3c1629209b9f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1384,6 +1384,9 @@ def rank_2d( bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val + if in_arr.shape[0] == 0 or in_arr.shape[1] == 0: + return np.empty_like(in_arr, dtype="f8") + tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' From 4aa4f8bbce26b4706381eeeb64851462d3718067 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 23:15:17 -0400 Subject: [PATCH 10/25] WIP --- pandas/_libs/algos.pyx | 36 ++++++++++++++++--------- pandas/tests/frame/methods/test_rank.py | 30 ++++++++++++--------- 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a3c1629209b9f..3a33206384fa8 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1377,7 +1377,7 @@ def rank_2d( const intp_t[:] labels ndarray[rank_t, ndim=2] values ndarray[rank_t, ndim=1] unused - rank_t[:, :] masked_vals_memview + rank_t[:, :] masked_vals intp_t[:, :] argsort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak @@ -1388,6 +1388,9 @@ def rank_2d( return np.empty_like(in_arr, dtype="f8") tiebreak = tiebreakers[ties_method] + if tiebreak == TIEBREAK_FIRST: + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING keep_na = na_option == 'keep' @@ -1403,20 +1406,29 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is int64_t and is_datetimelike: + mask = (values == NPY_NAT).astype(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).astype(np.uint8) + else: + mask = np.zeros_like(values, dtype=np.uint8) + nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: # For fused type specialization unused = values[:, 0] nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) - if rank_t is object: - mask = missing.isnaobj2d(values).astype(np.uint8) - elif rank_t is float64_t: - mask = np.isnan(values).astype(np.uint8) - - # int64 and datetimelike - else: - mask = (values == NPY_NAT).astype(np.uint8) + # if rank_t is object: + # mask = missing.isnaobj2d(values).view(np.uint8) + # elif rank_t is float64_t: + # mask = np.isnan(values).view(np.uint8) + # + # # int64 and datetimelike + # else: + # mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=np.uint8) @@ -1424,7 +1436,7 @@ def rank_2d( if nans_rank_highest: order = (values, mask) else: - order = (values, ~np.array(mask)) + order = (values, ~np.array(mask, copy=False)) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') @@ -1436,14 +1448,14 @@ def rank_2d( argsort_indexer = argsort_indexer[::-1, :] # putmask doesn't accept a memoryview, so we assign as a separate step - masked_vals_memview = values + masked_vals = values for col in range(k): rank_sorted_1d( out[:, col], grp_sizes[:, col], labels, argsort_indexer[:, col], - masked_vals_memview[:, col], + masked_vals[:, col], mask[:, col], tiebreak, check_mask, diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 5ba4ab4408f11..36dd6226866ca 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -246,13 +246,12 @@ def 
test_rank_methods_frame(self): expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) + @pytest.mark.parametrize("rank_method", ["average", "min", "max", "dense"]) @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") - def test_rank_descending(self, method, dtype): - + def test_rank_descending(self, rank_method, dtype): if "i" in dtype: - df = self.df.dropna() + df = self.df.dropna().astype(dtype) else: df = self.df.astype(dtype) @@ -260,18 +259,26 @@ def test_rank_descending(self, method, dtype): expected = (df.max() - df).rank() tm.assert_frame_equal(res, expected) - if method == "first" and dtype == "O": - return - - expected = (df.max() - df).rank(method=method) + expected = (df.max() - df).rank(method=rank_method) if dtype != "O": - res2 = df.rank(method=method, ascending=False, numeric_only=True) + res2 = df.rank(method=rank_method, ascending=False, numeric_only=True) tm.assert_frame_equal(res2, expected) - res3 = df.rank(method=method, ascending=False, numeric_only=False) + res3 = df.rank(method=rank_method, ascending=False, numeric_only=False) tm.assert_frame_equal(res3, expected) + @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) + @pytest.mark.parametrize("ascending", [True, False]) + def test_rank_first_ties(self, dtype, ascending, frame_or_series): + obj = frame_or_series([1, 1], dtype=dtype) + result = obj.rank(method="first", ascending=ascending) + expected_data = [1, 2] + if ascending: + expected_data = expected_data[::-1] + expected = frame_or_series(expected_data, dtype=np.float64) + tm.assert_equal(result, expected) + @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("dtype", [None, object]) def test_rank_2d_tie_methods(self, method, axis, dtype): @@ -287,9 +294,6 @@ def _check2d(df, expected, method="average", axis=0): result = df.rank(method=method, axis=axis) tm.assert_frame_equal(result, exp_df) - disabled = {(object, "first")} - if (dtype, method) in disabled: - return frame = df if dtype is None else df.astype(dtype) _check2d(frame, self.results[method], method=method, axis=axis) From c5ed688143a8ff9b278e473bad46b2140fb812bf Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 9 Jun 2021 23:34:49 -0400 Subject: [PATCH 11/25] WIP --- pandas/tests/frame/methods/test_rank.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 36dd6226866ca..46988350c2367 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -247,9 +247,8 @@ def test_rank_methods_frame(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) - @pytest.mark.parametrize("rank_method", ["average", "min", "max", "dense"]) @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") - def test_rank_descending(self, rank_method, dtype): + def test_rank_descending(self, method, dtype): if "i" in dtype: df = self.df.dropna().astype(dtype) else: @@ -259,26 +258,15 @@ def test_rank_descending(self, rank_method, dtype): expected = (df.max() - df).rank() tm.assert_frame_equal(res, expected) - expected = (df.max() - df).rank(method=rank_method) + expected = (df.max() - df).rank(method=method) if dtype != "O": - res2 = df.rank(method=rank_method, ascending=False, numeric_only=True) + res2 = 
df.rank(method=method, ascending=False, numeric_only=True) tm.assert_frame_equal(res2, expected) - res3 = df.rank(method=rank_method, ascending=False, numeric_only=False) + res3 = df.rank(method=method, ascending=False, numeric_only=False) tm.assert_frame_equal(res3, expected) - @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) - @pytest.mark.parametrize("ascending", [True, False]) - def test_rank_first_ties(self, dtype, ascending, frame_or_series): - obj = frame_or_series([1, 1], dtype=dtype) - result = obj.rank(method="first", ascending=ascending) - expected_data = [1, 2] - if ascending: - expected_data = expected_data[::-1] - expected = frame_or_series(expected_data, dtype=np.float64) - tm.assert_equal(result, expected) - @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("dtype", [None, object]) def test_rank_2d_tie_methods(self, method, axis, dtype): From 7a04159b64d2a09e461f8b66fa1048375faf64b8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 12:57:57 -0400 Subject: [PATCH 12/25] Add object first test --- pandas/tests/frame/methods/test_rank.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 46988350c2367..6c5831ad897d1 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -448,6 +448,38 @@ def test_rank_both_inf(self): result = df.rank() tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("top", True, [3.0, 1.0, 2.0]), + ("top", False, [2.0, 1.0, 3.0]), + ("bottom", True, [2.0, 3.0, 1.0]), + ("bottom", False, [1.0, 3.0, 2.0]), + ], + ) + def test_rank_inf_nans_na_option( + self, frame_or_series, method, na_option, ascending, expected + ): + obj = frame_or_series([np.inf, np.nan, -np.inf]) + result = obj.rank(method=method, na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("bottom", True, [1.0, 2.0, 4.0, 3.0]), + ("bottom", False, [1.0, 2.0, 4.0, 3.0]), + ("top", True, [2.0, 3.0, 1.0, 4.0]), + ("top", False, [2.0, 3.0, 1.0, 4.0]), + ], + ) + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + obj = frame_or_series(["foo", "foo", None, "foo"]) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + @pytest.mark.parametrize( "data,expected", [ From ab9989e69462b5cd7d8c6e2a8c52c832c83d3c3c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 13:00:51 -0400 Subject: [PATCH 13/25] Add back nogil --- pandas/_libs/algos.pyx | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3a33206384fa8..b1d05ee7ca677 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1449,21 +1449,22 @@ def rank_2d( # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals = values - for col in range(k): - rank_sorted_1d( - out[:, col], - grp_sizes[:, col], - labels, - argsort_indexer[:, col], - masked_vals[:, col], - mask[:, col], - tiebreak, - check_mask, - False, - keep_na, - pct, - n, - ) + with nogil: + for col in range(k): + rank_sorted_1d( + out[:, col], + grp_sizes[:, col], + labels, + argsort_indexer[:, col], + masked_vals[:, col], 
+ mask[:, col], + tiebreak, + check_mask, + False, + keep_na, + pct, + n, + ) if axis == 1: return np.array(out.T) From 5ba6459ec6490548b7444c845dcea8d1fde6190f Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 13:33:02 -0400 Subject: [PATCH 14/25] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 ++ pandas/_libs/algos.pyx | 37 +++++++++++++++------------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ff1c6ebf7aae2..674d6287be9ea 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -911,6 +911,8 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` when the DataFrame contained ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising an ``IndexError`` (:issue:`38932`) +- Bug in :meth:`DataFrame.rank` raising ... with ``object`` columns and ``method="first"`` (:issue:`...`) +- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`...`) - Bug in :meth:`Series.rank`, :meth:`DataFrame.rank`, and :meth:`.GroupBy.rank` treating the most negative ``int64`` value as missing (:issue:`32859`) - Bug in :meth:`DataFrame.select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36596`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed the argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b1d05ee7ca677..77e068406cf45 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1378,7 +1378,7 @@ def rank_2d( ndarray[rank_t, ndim=2] values ndarray[rank_t, ndim=1] unused rank_t[:, :] masked_vals - intp_t[:, :] argsort_indexer + intp_t[:, :] sort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest @@ -1406,29 +1406,20 @@ def rank_2d( if values.dtype != np.object_: values = values.astype('O') - if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is int64_t and is_datetimelike: - mask = (values == NPY_NAT).astype(np.uint8) - elif rank_t is float64_t: - mask = np.isnan(values).astype(np.uint8) - else: - mask = np.zeros_like(values, dtype=np.uint8) - nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: # For fused type specialization unused = values[:, 0] nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) - # if rank_t is object: - # mask = missing.isnaobj2d(values).view(np.uint8) - # elif rank_t is float64_t: - # mask = np.isnan(values).view(np.uint8) - # - # # int64 and datetimelike - # else: - # mask = (values == NPY_NAT).view(np.uint8) + if rank_t is object: + mask = missing.isnaobj2d(values).view(np.uint8) + elif rank_t is float64_t: + mask = np.isnan(values).view(np.uint8) + + # int64 and datetimelike + else: + mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) else: mask = np.zeros_like(values, dtype=np.uint8) @@ -1443,9 +1434,13 @@ def rank_2d( grp_sizes = np.ones((n, k), dtype='i8', order='F') labels = np.zeros(n, dtype=np.intp) - argsort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + if 
check_mask and not keep_na: + sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + else: + sort_indexer = values.argsort(axis=0).astype(np.intp, copy=False) + if not ascending: - argsort_indexer = argsort_indexer[::-1, :] + sort_indexer = sort_indexer[::-1, :] # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals = values @@ -1455,7 +1450,7 @@ def rank_2d( out[:, col], grp_sizes[:, col], labels, - argsort_indexer[:, col], + sort_indexer[:, col], masked_vals[:, col], mask[:, col], tiebreak, From 61540043a269e82841264e2a4524cc1b1b9c5579 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 14:32:49 -0400 Subject: [PATCH 15/25] Cleaner fused type handling --- pandas/_libs/algos.pyx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c55d1e9898b79..7e6521deac052 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -931,7 +931,7 @@ ctypedef fused rank_t: int64_t -cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, ndarray[rank_t, ndim=1] _): +cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None): """ Return the value we'll use to represent missing values when sorting depending on if we'd like missing values to end up at the top/bottom. (The second parameter @@ -1053,7 +1053,7 @@ def rank_1d( # will flip the ordering to still end up with lowest rank. # Symmetric logic applies to `na_option == 'bottom'` nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, masked_vals) + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) if nans_rank_highest: order = (masked_vals, mask, labels) else: @@ -1376,7 +1376,6 @@ def rank_2d( Py_ssize_t infs ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values - ndarray[rank_t, ndim=1] unused ndarray[intp_t, ndim=2] argsort_indexer ndarray[uint8_t, ndim=2] mask rank_t val, nan_fill_val @@ -1385,9 +1384,6 @@ def rank_2d( int64_t idx bint check_mask, condition, keep_na, nans_rank_highest - if in_arr.shape[0] == 0 or in_arr.shape[1] == 0: - return np.empty_like(in_arr, dtype="f8") - tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -1406,9 +1402,7 @@ def rank_2d( nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - # For fused type specialization - unused = values[:, 0] - nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused) + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) if rank_t is object: mask = missing.isnaobj2d(values) From 0f8744da80722984c99cd1d5d1ec9ebbcab74b73 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 15:03:40 -0400 Subject: [PATCH 16/25] Add comment --- pandas/_libs/algos.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 193d99108b74e..ed6ba52e4aede 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1428,12 +1428,17 @@ def rank_2d( grp_sizes = np.ones((n, k), dtype='i8', order='F') labels = np.zeros(n, dtype=np.intp) - sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + # lexsort is slower, so only use if we need to worry about the mask + if check_mask: + sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) + else: + kind = "stable" if ties_method == "first" else None + sort_indexer = values.argsort(axis=0, kind=kind) if not ascending: sort_indexer = sort_indexer[::-1, :] - # 
putmask doesn't accept a memoryview, so we assign as a separate step + # putmask doesn't accept a memoryview, so we assign in a separate step masked_vals = values with nogil: for col in range(k): From d47f2a6abb80d95affa3b885df7609a51bc8f8ea Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 15:18:58 -0400 Subject: [PATCH 17/25] Update whatsnew --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 674d6287be9ea..54411724a6709 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -911,8 +911,8 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` when the DataFrame contained ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising an ``IndexError`` (:issue:`38932`) -- Bug in :meth:`DataFrame.rank` raising ... with ``object`` columns and ``method="first"`` (:issue:`...`) -- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`...`) +- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) +- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`) - Bug in :meth:`Series.rank`, :meth:`DataFrame.rank`, and :meth:`.GroupBy.rank` treating the most negative ``int64`` value as missing (:issue:`32859`) - Bug in :meth:`DataFrame.select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36596`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed the argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) From da61fb8dee12953f8714a4b164cd95a10aaac47e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 17:03:27 -0400 Subject: [PATCH 18/25] Try 32-bit fix --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ed6ba52e4aede..ca57d7d686925 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1433,7 +1433,7 @@ def rank_2d( sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) else: kind = "stable" if ties_method == "first" else None - sort_indexer = values.argsort(axis=0, kind=kind) + sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False) if not ascending: sort_indexer = sort_indexer[::-1, :] From e2d96179fcbe2cf02ab77fa19860eaebe40d1dab Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 19:09:27 -0400 Subject: [PATCH 19/25] Debug 32-bit --- pandas/_libs/algos.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ca57d7d686925..8a3b5ac77ced8 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1091,8 +1091,6 @@ def rank_1d( return np.array(out) -@cython.wraparound(False) -@cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, From b4d11a410065bd0f9b903637cab8d955770a9c48 Mon Sep 
17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 10 Jun 2021 20:11:28 -0400 Subject: [PATCH 20/25] Debug 32-bit --- pandas/_libs/algos.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 8a3b5ac77ced8..ca57d7d686925 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1091,6 +1091,8 @@ def rank_1d( return np.array(out) +@cython.wraparound(False) +@cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, From 1e47daec7ed59dabb78006b3eef8251d8d47aa03 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 17 Jun 2021 10:46:43 -0700 Subject: [PATCH 21/25] Move whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 -- doc/source/whatsnew/v1.4.0.rst | 3 ++- pandas/_libs/algos.pyx | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7d71de9a4f261..6c2fef3808566 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -941,8 +941,6 @@ Numeric - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` when the DataFrame contained ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising an ``IndexError`` (:issue:`38932`) -- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) -- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`) - Bug in :meth:`Series.rank`, :meth:`DataFrame.rank`, and :meth:`.GroupBy.rank` treating the most negative ``int64`` value as missing (:issue:`32859`) - Bug in :meth:`DataFrame.select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36596`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed the argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 166ea2f0d4164..d748fcff14c61 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -137,7 +137,8 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) +- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`) - Conversion diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d776111f177bc..a026cbe447c19 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1261,6 +1261,7 @@ cdef void rank_sorted_1d( # when either of those change. 
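# A minimal pure-Python sketch of the dups/sum_ranks bookkeeping the comment
# above describes: walk the sorted values, count how many entries tie on the
# current value (dups), accumulate their positional ranks (sum_ranks), then
# backfill the averaged rank once the value changes. Names here are
# illustrative only and do not mirror the Cython implementation.
def _average_ranks(sorted_vals):
    out = [0.0] * len(sorted_vals)
    dups = sum_ranks = 0
    for i, val in enumerate(sorted_vals):
        dups += 1
        sum_ranks += i + 1
        at_end = i == len(sorted_vals) - 1
        if at_end or sorted_vals[i + 1] != val:
            # assign the average of the tied positions to every tied entry
            for j in range(i - dups + 1, i + 1):
                out[j] = sum_ranks / dups
            dups = sum_ranks = 0
    return out

_average_ranks([10, 20, 20, 30])  # [1.0, 2.5, 2.5, 4.0]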
Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or (masked_vals[sort_indexer[i]] != masked_vals[sort_indexer[i+1]]) @@ -1458,9 +1459,9 @@ def rank_2d( ) if axis == 1: - return np.array(out.T) + return np.asarray(out.T) else: - return np.array(out) + return np.asarray(out) ctypedef fused diff_t: From 8d038af23692e1bc9543b811ae0d0753153f2e00 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 17 Jun 2021 16:07:30 -0700 Subject: [PATCH 22/25] WIP --- pandas/_libs/algos.pyi | 2 +- pandas/_libs/algos.pyx | 243 ++++++++++++++++++++----------------- pandas/core/algorithms.py | 1 - pandas/tests/test_algos.py | 2 +- 4 files changed, 135 insertions(+), 113 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index d0f664c323a89..c398d8d45c5b8 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -123,7 +123,7 @@ def is_monotonic( def rank_1d( values: np.ndarray, # ndarray[rank_t, ndim=1] - labels: np.ndarray, # const int64_t[:] + labels: np.ndarray | None, # const int64_t[:]=None is_datetimelike: bool = ..., ties_method=..., ascending: bool = ..., diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a026cbe447c19..6f77ae214e0d2 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -389,11 +389,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr int64_t nobs = 0 bint no_nans float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor - const int64_t[:] labels_n, labels_nobs N, K = (mat).shape - # For compatibility when calling rank_1d - labels_n = np.zeros(N, dtype=np.int64) # Handle the edge case where we know all results will be nan # to keep conditional logic inside loop simpler @@ -412,7 +409,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr maskedx = np.empty(N, dtype=np.float64) maskedy = np.empty(N, dtype=np.float64) for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) + ranked_mat[:, i] = rank_1d(mat[:, i]) with nogil: for xi in range(K): @@ -451,11 +448,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr with gil: # We need to slice back to nobs because rank_1d will # require arrays of nobs length - labels_nobs = np.zeros(nobs, dtype=np.int64) - rankedx = rank_1d(np.array(maskedx)[:nobs], - labels=labels_nobs) - rankedy = rank_1d(np.array(maskedy)[:nobs], - labels=labels_nobs) + rankedx = rank_1d(np.array(maskedx)[:nobs]) + rankedy = rank_1d(np.array(maskedy)[:nobs]) for i in range(nobs): maskedx[i] = rankedx[i] maskedy[i] = rankedy[i] @@ -518,7 +512,6 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra int64_t total_discordant = 0 float64_t kendall_tau int64_t n_obs - const intp_t[:] labels_n N, K = (mat).shape @@ -526,11 +519,9 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra mask = np.isfinite(mat) ranked_mat = np.empty((N, K), dtype=np.float64) - # For compatibility when calling rank_1d - labels_n = np.zeros(N, dtype=np.intp) for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) + ranked_mat[:, i] = rank_1d(mat[:, i]) for xi in range(K): sorted_idxs = ranked_mat[:, xi].argsort() @@ -961,7 +952,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None): @cython.boundscheck(False) def rank_1d( ndarray[rank_t, ndim=1] values, - const intp_t[:] labels, + const intp_t[:] labels=None, bint is_datetimelike=False, ties_method="average", bint ascending=True, @@ 
-974,10 +965,10 @@ def rank_1d( Parameters ---------- values : array of rank_t values to be ranked - labels : np.ndarray[np.intp] + labels : np.ndarray[np.intp] or None Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. If not called - from a groupby operation, will be an array of 0's + from a groupby operation, will be None. is_datetimelike : bool, default False True if `values` contains datetime-like entries. ties_method : {'average', 'min', 'max', 'first', 'dense'}, default @@ -1000,12 +991,12 @@ def rank_1d( cdef: TiebreakEnumType tiebreak Py_ssize_t N - int64_t[::1] grp_sizes - intp_t[:] lexsort_indexer + intp_t[:] sort_indexer float64_t[::1] out ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview - uint8_t[:] mask + int64_t[::1] grp_sizes=None + uint8_t[:] mask=None bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val @@ -1017,17 +1008,18 @@ def rank_1d( keep_na = na_option == 'keep' N = len(values) - # TODO Cython 3.0: cast won't be necessary (#2992) - assert len(labels) == N + if labels is not None: + # TODO Cython 3.0: cast won't be necessary (#2992) + assert len(labels) == N out = np.empty(N) - grp_sizes = np.ones(N, dtype=np.int64) - # If all 0 labels, can short-circuit later label + # If we don't care about labels, can short-circuit later label # comparisons - check_labels = np.any(labels) + check_labels = labels is not None - # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + # If this doesn't hold, we don't care about group sizes, so don't even allocate + if pct and (tiebreak == TIEBREAK_DENSE or check_labels): + grp_sizes = np.ones(N, dtype=np.int64) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data @@ -1043,49 +1035,69 @@ def rank_1d( mask = (masked_vals == NPY_NAT).astype(np.uint8) elif rank_t is float64_t: mask = np.isnan(masked_vals).astype(np.uint8) + + # For cases where a mask is not possible, we can avoid mask checks + check_mask = mask is not None + + if check_mask: + # If `na_option == 'top'`, we want to assign the lowest rank + # to NaN regardless of ascending/descending. So if ascending, + # fill with lowest value of type to end up with lowest rank. + # If descending, fill with highest value since descending + # will flip the ordering to still end up with lowest rank. + # Symmetric logic applies to `na_option == 'bottom'` + nans_rank_highest = ascending ^ (na_option == 'top') + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + np.putmask(masked_vals, mask, nan_fill_val) + + # Depending on whether we care about labels and masks, we need + # different sorting criteria + + if check_mask and check_labels: + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + if nans_rank_highest: + order = (masked_vals, mask, labels) + else: + order = (masked_vals, ~(np.asarray(mask)), labels) + elif check_mask: + if nans_rank_highest: + order = (masked_vals, mask) + else: + order = (masked_vals, ~(np.asarray(mask))) + elif check_labels: + order = (masked_vals, labels) else: - mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) - - # If `na_option == 'top'`, we want to assign the lowest rank - # to NaN regardless of ascending/descending. 
So if ascending, - # fill with lowest value of type to end up with lowest rank. - # If descending, fill with highest value since descending - # will flip the ordering to still end up with lowest rank. - # Symmetric logic applies to `na_option == 'bottom'` - nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) - if nans_rank_highest: - order = (masked_vals, mask, labels) + order = None + + # lexsort is slower, so only use if we actually need to sort on multiple keys + if order is not None: + sort_indexer = np.lexsort(order).astype(np.intp, copy=False) else: - order = (masked_vals, ~(np.asarray(mask)), labels) + kind = "stable" if ties_method == "first" else None + sort_indexer = masked_vals.argsort(kind=kind).astype(np.intp, copy=False) + # print(np.array(sort_indexer)) - np.putmask(masked_vals, mask, nan_fill_val) # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals_memview = masked_vals - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False) - if not ascending: - lexsort_indexer = lexsort_indexer[::-1] + sort_indexer = sort_indexer[::-1] with nogil: rank_sorted_1d( out, - grp_sizes, - labels, - lexsort_indexer, masked_vals_memview, - mask, - tiebreak, - check_mask, - check_labels, - keep_na, - pct, + sort_indexer, N, + mask=mask, + grp_sizes=grp_sizes, + tiebreak=tiebreak, + keep_na=keep_na, + pct=pct, + labels=labels, ) return np.asarray(out) @@ -1095,18 +1107,18 @@ def rank_1d( @cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, - int64_t[::1] grp_sizes, - const intp_t[:] labels, - const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) rank_t[:] masked_vals, - const uint8_t[:] mask, - TiebreakEnumType tiebreak, - bint check_mask, - bint check_labels, - bint keep_na, - bint pct, + const intp_t[:] sort_indexer, Py_ssize_t N, + const uint8_t[:] mask=None, + int64_t[::1] grp_sizes=None, + TiebreakEnumType tiebreak=TIEBREAK_AVERAGE, + bint keep_na=True, + bint pct=False, + # https://github.com/cython/cython/issues/1630, only trailing arguments can + # currently be omitted for cdef functions, which is why we keep these at the end + const intp_t[:] labels=None, ) nogil: """ See rank_1d.__doc__. 
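# A rough pure-NumPy sketch of the sort-key selection shown above: lexsort is
# only needed when several keys matter (values plus mask and/or labels); with a
# single key a plain argsort, stable for method="first", suffices. The helper
# name and the "missing sorts last" choice are assumptions for illustration;
# the patch picks mask or ~mask depending on where missing values should rank.
import numpy as np

def _build_sort_indexer(masked_vals, mask=None, labels=None, ties_method="average"):
    keys = [np.asarray(masked_vals)]
    if mask is not None:
        keys.append(np.asarray(mask))    # push missing values after non-missing
    if labels is not None:
        keys.append(np.asarray(labels))  # np.lexsort sorts by the last key first
    if len(keys) > 1:
        return np.lexsort(keys).astype(np.intp, copy=False)
    kind = "stable" if ties_method == "first" else None
    return keys[0].argsort(kind=kind).astype(np.intp, copy=False)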
Handles only actual ranking, so sorting and masking should @@ -1116,36 +1128,40 @@ cdef void rank_sorted_1d( ---------- out : float64_t[::1] Array to store computed ranks - grp_sizes : int64_t[::1] - Array to store group counts, only used if pct=True - labels : See rank_1d.__doc__ - sort_indexer : intp_t[:] - Array of indices which sorts masked_vals masked_vals : rank_t[:] The values input to rank_1d, with missing values replaced by fill values - mask : uint8_t[:] - Array where entries are True if the value is missing, False otherwise - tiebreak : TiebreakEnumType - See rank_1d.__doc__ for the different modes - check_mask : bool - If False, assumes the mask is all False to skip mask indexing - check_labels : bool - If False, assumes all labels are the same to skip group handling logic - keep_na : bool - Whether or not to keep nulls - pct : bool - Compute percentage rank of data within each group + sort_indexer : intp_t[:] + Array of indices which sorts masked_vals N : Py_ssize_t The number of elements to rank. Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) + mask : uint8_t[:], default None + Array where entries are True if the value is missing, False otherwise. None + implies the mask is all False + grp_sizes : int64_t[::1], default None + Array to store group counts, only used if pct=True. Should only be None + if labels is None. + tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE + See rank_1d.__doc__ for the different modes + keep_na : bool, default True + Whether or not to keep nulls + pct : bool, default False + Compute percentage rank of data within each group + labels : See rank_1d.__doc__, default None. None implies all labels are the same. """ cdef: Py_ssize_t i, j, dups=0, sum_ranks=0, Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 - bint at_end, next_val_diff, group_changed + bint at_end, next_val_diff, group_changed, check_mask, check_labels + bint grp_size_needed int64_t grp_size + check_mask = mask is not None + check_labels = labels is not None + # Group size only needs to be tracked if we have groups or are doing dense ranking + grp_size_needed = pct and (check_labels or tiebreak == TIEBREAK_DENSE) + # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer # array that we sorted previously, which gives us the location of @@ -1245,8 +1261,9 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + if grp_size_needed: + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1345,8 +1362,9 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + if grp_size_needed: + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1354,9 +1372,15 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 if pct: + # If we're grouping, use the computed group sizes, otherwise we can just + # use the data length + for i in range(N): - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] + if grp_size_needed: + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] + else: + out[i] = out[i] / N def rank_2d( @@ -1374,12 +1398,12 @@ def rank_2d( cdef: Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous - int64_t[::1, :] 
grp_sizes - const intp_t[:] labels ndarray[rank_t, ndim=2] values rank_t[:, :] masked_vals intp_t[:, :] sort_indexer - uint8_t[:, :] mask + uint8_t[:, :] mask=None + uint8_t[:] mask_arg=None + int64_t[::1] grp_sizes=None TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val @@ -1416,21 +1440,21 @@ def rank_2d( else: mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) - else: - mask = np.zeros_like(values, dtype=np.uint8) - - if nans_rank_highest: - order = (values, mask) - else: - order = (values, ~np.asarray(mask)) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') - grp_sizes = np.ones((n, k), dtype='i8', order='F') - labels = np.zeros(n, dtype=np.intp) + + # If this doesn't hold, we don't care about group sizes, so don't even allocate + if pct and tiebreak == TIEBREAK_DENSE: + grp_sizes = np.ones(n, dtype=np.int64) # lexsort is slower, so only use if we need to worry about the mask if check_mask: + if nans_rank_highest: + order = (values, mask) + else: + order = (values, ~np.asarray(mask)) + sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) else: kind = "stable" if ties_method == "first" else None @@ -1443,19 +1467,18 @@ def rank_2d( masked_vals = values with nogil: for col in range(k): + if mask is not None: + mask_arg = mask[:, col] rank_sorted_1d( out[:, col], - grp_sizes[:, col], - labels, - sort_indexer[:, col], masked_vals[:, col], - mask[:, col], - tiebreak, - check_mask, - False, - keep_na, - pct, + sort_indexer[:, col], n, + mask=mask_arg, + grp_sizes=grp_sizes, + tiebreak=tiebreak, + keep_na=keep_na, + pct=pct, ) if axis == 1: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7dcc83f76db75..2d108b599bbaf 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1008,7 +1008,6 @@ def rank( if values.ndim == 1: ranks = algos.rank_1d( values, - labels=np.zeros(len(values), dtype=np.intp), is_datetimelike=is_datetimelike, ties_method=method, ascending=ascending, diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 4df95d895e475..b4836dffffa06 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1747,7 +1747,7 @@ def test_scipy_compat(self): def _check(arr): mask = ~np.isfinite(arr) arr = arr.copy() - result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.intp)) + result = libalgos.rank_1d(arr) arr[mask] = np.inf exp = rankdata(arr) exp[mask] = np.nan From f90b8d9726b6649dcb49be2a55da6541d377f913 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 17 Jun 2021 16:10:55 -0700 Subject: [PATCH 23/25] Clean up group sizes --- pandas/_libs/algos.pyx | 51 +++++++++++++----------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6f77ae214e0d2..9d55535b55e2e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -995,7 +995,7 @@ def rank_1d( float64_t[::1] out ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview - int64_t[::1] grp_sizes=None + int64_t[::1] grp_sizes uint8_t[:] mask=None bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val @@ -1012,15 +1012,12 @@ def rank_1d( # TODO Cython 3.0: cast won't be necessary (#2992) assert len(labels) == N out = np.empty(N) + grp_sizes = np.ones(N, dtype=np.int64) # If we don't care about labels, can short-circuit later label # comparisons check_labels = labels is not None - # If this doesn't hold, we don't care 
about group sizes, so don't even allocate - if pct and (tiebreak == TIEBREAK_DENSE or check_labels): - grp_sizes = np.ones(N, dtype=np.int64) - # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -1052,7 +1049,6 @@ def rank_1d( # Depending on whether we care about labels and masks, we need # different sorting criteria - if check_mask and check_labels: # lexsort using labels, then mask, then actual values # each label corresponds to a different group value, @@ -1078,7 +1074,6 @@ def rank_1d( else: kind = "stable" if ties_method == "first" else None sort_indexer = masked_vals.argsort(kind=kind).astype(np.intp, copy=False) - # print(np.array(sort_indexer)) # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals_memview = masked_vals @@ -1091,9 +1086,9 @@ def rank_1d( out, masked_vals_memview, sort_indexer, + grp_sizes, N, mask=mask, - grp_sizes=grp_sizes, tiebreak=tiebreak, keep_na=keep_na, pct=pct, @@ -1110,9 +1105,9 @@ cdef void rank_sorted_1d( # Can make const with cython3 (https://github.com/cython/cython/issues/3222) rank_t[:] masked_vals, const intp_t[:] sort_indexer, + int64_t[::1] grp_sizes, Py_ssize_t N, const uint8_t[:] mask=None, - int64_t[::1] grp_sizes=None, TiebreakEnumType tiebreak=TIEBREAK_AVERAGE, bint keep_na=True, bint pct=False, @@ -1132,15 +1127,15 @@ cdef void rank_sorted_1d( The values input to rank_1d, with missing values replaced by fill values sort_indexer : intp_t[:] Array of indices which sorts masked_vals + grp_sizes : int64_t[::1] + Array to store group counts, only used if pct=True. Should only be None + if labels is None. N : Py_ssize_t The number of elements to rank. Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) mask : uint8_t[:], default None Array where entries are True if the value is missing, False otherwise. None implies the mask is all False - grp_sizes : int64_t[::1], default None - Array to store group counts, only used if pct=True. Should only be None - if labels is None. 
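# A small illustrative sketch of the pct step that consumes grp_sizes: once
# each element records the size of its group, percentage ranks are an
# element-wise division, with zero-sized groups left untouched (mirroring the
# `grp_sizes[i] != 0` guard). The helper name is hypothetical.
import numpy as np

def _to_pct(ranks, grp_sizes):
    out = np.asarray(ranks, dtype=np.float64).copy()
    sizes = np.asarray(grp_sizes)
    nonzero = sizes != 0
    out[nonzero] = out[nonzero] / sizes[nonzero]
    return out

_to_pct([1.0, 2.5, 2.5, 4.0], [4, 4, 4, 4])  # array([0.25, 0.625, 0.625, 1.0])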
tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE See rank_1d.__doc__ for the different modes keep_na : bool, default True @@ -1154,13 +1149,10 @@ cdef void rank_sorted_1d( Py_ssize_t i, j, dups=0, sum_ranks=0, Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 bint at_end, next_val_diff, group_changed, check_mask, check_labels - bint grp_size_needed int64_t grp_size check_mask = mask is not None check_labels = labels is not None - # Group size only needs to be tracked if we have groups or are doing dense ranking - grp_size_needed = pct and (check_labels or tiebreak == TIEBREAK_DENSE) # Loop over the length of the value array # each incremental i value can be looked up in the lexsort_indexer @@ -1261,9 +1253,8 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - if grp_size_needed: - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1362,9 +1353,8 @@ cdef void rank_sorted_1d( else: grp_size = grp_vals_seen - (grp_na_count > 0) - if grp_size_needed: - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size dups = sum_ranks = 0 grp_na_count = 0 @@ -1372,15 +1362,9 @@ cdef void rank_sorted_1d( grp_vals_seen = 1 if pct: - # If we're grouping, use the computed group sizes, otherwise we can just - # use the data length - for i in range(N): - if grp_size_needed: - if grp_sizes[i] != 0: - out[i] = out[i] / grp_sizes[i] - else: - out[i] = out[i] / N + if grp_sizes[i] != 0: + out[i] = out[i] / grp_sizes[i] def rank_2d( @@ -1401,9 +1385,9 @@ def rank_2d( ndarray[rank_t, ndim=2] values rank_t[:, :] masked_vals intp_t[:, :] sort_indexer + int64_t[::1] grp_sizes uint8_t[:, :] mask=None uint8_t[:] mask_arg=None - int64_t[::1] grp_sizes=None TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val @@ -1443,10 +1427,7 @@ def rank_2d( n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') - - # If this doesn't hold, we don't care about group sizes, so don't even allocate - if pct and tiebreak == TIEBREAK_DENSE: - grp_sizes = np.ones(n, dtype=np.int64) + grp_sizes = np.ones(n, dtype=np.int64) # lexsort is slower, so only use if we need to worry about the mask if check_mask: @@ -1473,9 +1454,9 @@ def rank_2d( out[:, col], masked_vals[:, col], sort_indexer[:, col], + grp_sizes, n, mask=mask_arg, - grp_sizes=grp_sizes, tiebreak=tiebreak, keep_na=keep_na, pct=pct, From d9b234216cd436e2c1381768df35971e221bbf7b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 26 Jun 2021 15:20:24 -0400 Subject: [PATCH 24/25] Fixups --- pandas/_libs/algos.pyi | 2 +- pandas/_libs/algos.pyx | 136 ++++++++++++++++++----------------------- 2 files changed, 62 insertions(+), 76 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index c398d8d45c5b8..9da5534c51321 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -123,7 +123,7 @@ def is_monotonic( def rank_1d( values: np.ndarray, # ndarray[rank_t, ndim=1] - labels: np.ndarray | None, # const int64_t[:]=None + labels: np.ndarray | None = ..., # const int64_t[:]=None is_datetimelike: bool = ..., ties_method=..., ascending: bool = ..., diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a3aea22073803..1bfbe49e0a82f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -991,12 +991,12 @@ def rank_1d( 
cdef: TiebreakEnumType tiebreak Py_ssize_t N - intp_t[:] sort_indexer + int64_t[::1] grp_sizes + intp_t[:] lexsort_indexer float64_t[::1] out ndarray[rank_t, ndim=1] masked_vals rank_t[:] masked_vals_memview - int64_t[::1] grp_sizes - uint8_t[:] mask=None + uint8_t[:] mask bint keep_na, nans_rank_highest, check_labels, check_mask rank_t nan_fill_val @@ -1018,6 +1018,9 @@ def rank_1d( # comparisons check_labels = labels is not None + # For cases where a mask is not possible, we can avoid mask checks + check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array @@ -1032,63 +1035,47 @@ def rank_1d( mask = (masked_vals == NPY_NAT).astype(np.uint8) elif rank_t is float64_t: mask = np.isnan(masked_vals).astype(np.uint8) - - # For cases where a mask is not possible, we can avoid mask checks - check_mask = mask is not None - - if check_mask: - # If `na_option == 'top'`, we want to assign the lowest rank - # to NaN regardless of ascending/descending. So if ascending, - # fill with lowest value of type to end up with lowest rank. - # If descending, fill with highest value since descending - # will flip the ordering to still end up with lowest rank. - # Symmetric logic applies to `na_option == 'bottom'` - nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) - np.putmask(masked_vals, mask, nan_fill_val) - - # Depending on whether we care about labels and masks, we need - # different sorting criteria - if check_mask and check_labels: - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - if nans_rank_highest: - order = (masked_vals, mask, labels) - else: - order = (masked_vals, ~(np.asarray(mask)), labels) - elif check_mask: - if nans_rank_highest: - order = (masked_vals, mask) - else: - order = (masked_vals, ~(np.asarray(mask))) - elif check_labels: - order = (masked_vals, labels) else: - order = None - - # lexsort is slower, so only use if we actually need to sort on multiple keys - if order is not None: - sort_indexer = np.lexsort(order).astype(np.intp, copy=False) + mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + + # If `na_option == 'top'`, we want to assign the lowest rank + # to NaN regardless of ascending/descending. So if ascending, + # fill with lowest value of type to end up with lowest rank. + # If descending, fill with highest value since descending + # will flip the ordering to still end up with lowest rank. 
+ # Symmetric logic applies to `na_option == 'bottom'` + nans_rank_highest = ascending ^ (na_option == 'top') + nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + if nans_rank_highest: + order = [masked_vals, mask] else: - kind = "stable" if ties_method == "first" else None - sort_indexer = masked_vals.argsort(kind=kind).astype(np.intp, copy=False) + order = [masked_vals, ~(np.asarray(mask))] + + if check_labels: + order.append(labels) + np.putmask(masked_vals, mask, nan_fill_val) # putmask doesn't accept a memoryview, so we assign as a separate step masked_vals_memview = masked_vals + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False) + if not ascending: - sort_indexer = sort_indexer[::-1] + lexsort_indexer = lexsort_indexer[::-1] with nogil: rank_sorted_1d( out, - masked_vals_memview, - sort_indexer, grp_sizes, + lexsort_indexer, + masked_vals_memview, + mask, + check_mask, N, - mask=mask, tiebreak=tiebreak, keep_na=keep_na, pct=pct, @@ -1102,17 +1089,18 @@ def rank_1d( @cython.boundscheck(False) cdef void rank_sorted_1d( float64_t[::1] out, + int64_t[::1] grp_sizes, + const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) rank_t[:] masked_vals, - const intp_t[:] sort_indexer, - int64_t[::1] grp_sizes, + const uint8_t[:] mask, + bint check_mask, Py_ssize_t N, - const uint8_t[:] mask=None, TiebreakEnumType tiebreak=TIEBREAK_AVERAGE, bint keep_na=True, bint pct=False, # https://github.com/cython/cython/issues/1630, only trailing arguments can - # currently be omitted for cdef functions, which is why we keep these at the end + # currently be omitted for cdef functions, which is why we keep this at the end const intp_t[:] labels=None, ) nogil: """ @@ -1123,19 +1111,20 @@ cdef void rank_sorted_1d( ---------- out : float64_t[::1] Array to store computed ranks - masked_vals : rank_t[:] - The values input to rank_1d, with missing values replaced by fill values - sort_indexer : intp_t[:] - Array of indices which sorts masked_vals grp_sizes : int64_t[::1] Array to store group counts, only used if pct=True. Should only be None if labels is None. + sort_indexer : intp_t[:] + Array of indices which sorts masked_vals + masked_vals : rank_t[:] + The values input to rank_1d, with missing values replaced by fill values + mask : uint8_t[:] + Array where entries are True if the value is missing, False otherwise. + check_mask : bool + If False, assumes the mask is all False to skip mask indexing N : Py_ssize_t The number of elements to rank. Note: it is not always true that N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why) - mask : uint8_t[:], default None - Array where entries are True if the value is missing, False otherwise. 
None - implies the mask is all False tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE See rank_1d.__doc__ for the different modes keep_na : bool, default True @@ -1148,10 +1137,9 @@ cdef void rank_sorted_1d( cdef: Py_ssize_t i, j, dups=0, sum_ranks=0, Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0 - bint at_end, next_val_diff, group_changed, check_mask, check_labels + bint at_end, next_val_diff, group_changed, check_labels int64_t grp_size - check_mask = mask is not None check_labels = labels is not None # Loop over the length of the value array @@ -1382,12 +1370,11 @@ def rank_2d( cdef: Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous + int64_t[::1] grp_sizes ndarray[rank_t, ndim=2] values rank_t[:, :] masked_vals intp_t[:, :] sort_indexer - int64_t[::1] grp_sizes - uint8_t[:, :] mask=None - uint8_t[:] mask_arg=None + uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest rank_t nan_fill_val @@ -1424,18 +1411,18 @@ def rank_2d( else: mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) + else: + mask = np.zeros_like(values, dtype=np.uint8) + + if nans_rank_highest: + order = (values, mask) + else: + order = (values, ~np.asarray(mask)) n, k = (values).shape out = np.empty((n, k), dtype='f8', order='F') grp_sizes = np.ones(n, dtype=np.int64) - # lexsort is slower, so only use if we need to worry about the mask - if check_mask: - if nans_rank_highest: - order = (values, mask) - else: - order = (values, ~np.asarray(mask)) - # lexsort is slower, so only use if we need to worry about the mask if check_mask: sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False) @@ -1450,15 +1437,14 @@ def rank_2d( masked_vals = values with nogil: for col in range(k): - if mask is not None: - mask_arg = mask[:, col] rank_sorted_1d( out[:, col], - masked_vals[:, col], - sort_indexer[:, col], grp_sizes, + sort_indexer[:, col], + masked_vals[:, col], + mask[:, col], + check_mask, n, - mask=mask_arg, tiebreak=tiebreak, keep_na=keep_na, pct=pct, From e92d7c555dfaaf1cf3501b2643095309ae0d84dc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Jun 2021 22:47:43 -0400 Subject: [PATCH 25/25] array -> asarray and better arg calling --- pandas/_libs/algos.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 1bfbe49e0a82f..172f2bfb49160 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -448,8 +448,8 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr with gil: # We need to slice back to nobs because rank_1d will # require arrays of nobs length - rankedx = rank_1d(np.array(maskedx)[:nobs]) - rankedy = rank_1d(np.array(maskedy)[:nobs]) + rankedx = rank_1d(np.asarray(maskedx)[:nobs]) + rankedy = rank_1d(np.asarray(maskedy)[:nobs]) for i in range(nobs): maskedx[i] = rankedx[i] maskedy[i] = rankedy[i] @@ -1074,8 +1074,8 @@ def rank_1d( lexsort_indexer, masked_vals_memview, mask, - check_mask, - N, + check_mask=check_mask, + N=N, tiebreak=tiebreak, keep_na=keep_na, pct=pct, @@ -1443,8 +1443,8 @@ def rank_2d( sort_indexer[:, col], masked_vals[:, col], mask[:, col], - check_mask, - n, + check_mask=check_mask, + N=n, tiebreak=tiebreak, keep_na=keep_na, pct=pct,
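A short usage example of the behaviour targeted by the whatsnew entries above, with the missing value no longer tying against ``np.inf`` when ``na_option`` is ``"top"`` or ``"bottom"``. The outputs shown are the expected results after the fix, assuming the default ``method="average"``::

    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, np.inf, 1.0])

    # With na_option="bottom", the missing value should rank strictly after
    # every real value, including np.inf, rather than tying with it.
    s.rank(na_option="bottom")
    # expected ranks by position: 3.0 (NaN), 2.0 (inf), 1.0

    # Symmetrically, na_option="top" should rank the missing value first.
    s.rank(na_option="top")
    # expected ranks by position: 1.0 (NaN), 3.0 (inf), 2.0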