From 23643305b2b42c3f76f6982c3b98960283fee594 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 18:22:19 -0400 Subject: [PATCH 01/16] CLN: rank_1d followup --- pandas/_libs/algos.pyx | 86 +++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 047eb848b7540..acbe114db82c6 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -951,8 +951,9 @@ def rank_1d( ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels + bint keep_na, at_end, next_val_diff, check_labels, set_as_na rank_t nan_fill_val + float computed_rank tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -1037,11 +1038,8 @@ def rank_1d( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if not at_end: - next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) - else: - next_val_diff = True + next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) if (next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) @@ -1051,28 +1049,32 @@ def rank_1d( ): # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN - grp_na_count = dups + set_as_na = keep_na and mask[lexsort_indexer[i]] + + # For all cases except TIEBREAK_FIRST when not setting + # nulls, we set the same value at each index + if set_as_na: + computed_rank = NaN + grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups + computed_rank = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 + computed_rank = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + else: for j in range(i - dups + 1, i + 1): if ascending: out[lexsort_indexer[j]] = j + 1 - grp_start else: - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: + out[lexsort_indexer[j]] = \ + (2 * i - j - dups + 2 - grp_start) + + if set_as_na or tiebreak != TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen + out[lexsort_indexer[j]] = computed_rank # look forward to the next value (using the sorting in _as) # if the value does not equal the current value then we need to @@ -1083,7 +1085,6 @@ def rank_1d( ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 - grp_tie_count += 1 # Similar to the previous conditional, check now if we are # moving to a new group. 
If so, keep track of the index where @@ -1102,10 +1103,9 @@ def rank_1d( else: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ - (grp_tie_count - (grp_na_count > 0)) + (grp_vals_seen - 1 - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 else: @@ -1124,11 +1124,8 @@ def rank_1d( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if not at_end: - next_val_diff = (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) - else: - next_val_diff = True + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) if (next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) @@ -1138,29 +1135,32 @@ def rank_1d( ): # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN - grp_na_count = dups + set_as_na = keep_na and mask[lexsort_indexer[i]] + + # For all cases except TIEBREAK_FIRST when not setting + # nulls, we set the same value at each index + if set_as_na: + computed_rank = NaN + grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups + computed_rank = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 + computed_rank = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + else: for j in range(i - dups + 1, i + 1): if ascending: out[lexsort_indexer[j]] = j + 1 - grp_start else: out[lexsort_indexer[j]] = \ (2 * i - j - dups + 2 - grp_start) - elif tiebreak == TIEBREAK_DENSE: + + if set_as_na or tiebreak != TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen + out[lexsort_indexer[j]] = computed_rank # look forward to the next value (using the sorting in # lexsort_indexer) if the value does not equal the current @@ -1171,7 +1171,6 @@ def rank_1d( ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 - grp_tie_count += 1 # Similar to the previous conditional, check now if we are # moving to a new group. 
If so, keep track of the index where @@ -1189,10 +1188,9 @@ def rank_1d( else: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ - (grp_tie_count - (grp_na_count > 0)) + (grp_vals_seen - 1 - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 From 999d8802bd0044ba37f589eb3b0ef7ea4b17916a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:00:35 -0400 Subject: [PATCH 02/16] WIP --- pandas/_libs/algos.pyx | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index acbe114db82c6..caec06d498312 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -963,6 +963,7 @@ def rank_1d( assert len(labels) == N out = np.empty(N) grp_sizes = np.ones(N) + # If all 0 labels, can short-circuit later label # comparisons check_labels = np.any(labels) @@ -1026,6 +1027,7 @@ def rank_1d( if rank_t is object: for i in range(N): at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset # when either of those change @@ -1033,20 +1035,23 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) + # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) + if (next_val_diff or group_changed + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - if (next_val_diff - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) - or (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - ): # if keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] @@ -1092,10 +1097,7 @@ def rank_1d( # decrement that from their position. fill in the size of each # group encountered (used by pct calculations later). 
also be # sure to reset any of the items helping to calculate dups - if (at_end or - (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]]))): + if group_changed: if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ @@ -1112,6 +1114,7 @@ def rank_1d( with nogil: for i in range(N): at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset # when either of those change @@ -1119,14 +1122,20 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) + # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) - if (next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) or (check_labels @@ -1137,8 +1146,8 @@ def rank_1d( # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST when not setting - # nulls, we set the same value at each index + # For all cases except TIEBREAK_FIRST and a non-null value, + # we set the same value at each index if set_as_na: computed_rank = NaN grp_na_count = dups @@ -1178,9 +1187,7 @@ def rank_1d( # decrement that from their position. fill in the size of each # group encountered (used by pct calculations later). also be # sure to reset any of the items helping to calculate dups - if at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])): + if group_changed: if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ From d360871bd03f62c4fd245ea1d026f2e804c86af9 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:13:43 -0400 Subject: [PATCH 03/16] WIP --- pandas/_libs/algos.pyx | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index caec06d498312..cb968d9e8b1e6 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -951,7 +951,7 @@ def rank_1d( ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels, set_as_na + bint keep_na, at_end, next_val_diff, check_labels, set_as_na, group_changed rank_t nan_fill_val float computed_rank @@ -1086,8 +1086,7 @@ def rank_1d( # reset the dups and sum_ranks, knowing that a new value is # coming up. 
the conditional also needs to handle nan equality # and the end of iteration - if next_val_diff or (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]]): + if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1122,32 +1121,29 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != + masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if (next_val_diff - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) - or (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - ): + if (next_val_diff or group_changed + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + # if keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST and a non-null value, - # we set the same value at each index + # For all cases except TIEBREAK_FIRST when not setting + # nulls, we set the same value at each index if set_as_na: computed_rank = NaN grp_na_count = dups @@ -1171,13 +1167,12 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = computed_rank - # look forward to the next value (using the sorting in - # lexsort_indexer) if the value does not equal the current - # value then we need to reset the dups and sum_ranks, - # knowing that a new value is coming up. the conditional - # also needs to handle nan equality and the end of iteration - if next_val_diff or (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]]): + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. 
the conditional also needs to handle nan equality + # and the end of iteration + if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 From 0aaeee71b6cda176fc89946b32f333d382381a7d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:21:22 -0400 Subject: [PATCH 04/16] WIP --- pandas/_libs/algos.pyx | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index cb968d9e8b1e6..d06ab47c116e9 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -985,6 +985,8 @@ def rank_1d( else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + # If ascending is true and na_option == 'bottom', + # fill with the largest so NaN if ascending ^ (na_option == 'top'): if rank_t is object: nan_fill_val = Infinity() @@ -1030,13 +1032,12 @@ def rank_1d( # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers + # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != + masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1052,7 +1053,7 @@ def rank_1d( if (next_val_diff or group_changed or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - # if keep_na, check for missing values and assign back + # If keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] @@ -1081,11 +1082,15 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = computed_rank - # look forward to the next value (using the sorting in _as) + # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality + # coming up. The conditional also needs to handle nan equality # and the end of iteration + + # This condition is equivalent to `next_val_diff or + # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` + # Helps potentially avoid 2 mask lookups if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1093,8 +1098,8 @@ def rank_1d( # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). Also be # sure to reset any of the items helping to calculate dups if group_changed: if tiebreak != TIEBREAK_DENSE: @@ -1116,8 +1121,7 @@ def rank_1d( # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers + # when either of those change. 
Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 @@ -1138,7 +1142,7 @@ def rank_1d( if (next_val_diff or group_changed or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - # if keep_na, check for missing values and assign back + # If keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] @@ -1167,11 +1171,15 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = computed_rank - # look forward to the next value (using the sorting in _as) + # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality + # coming up. The conditional also needs to handle nan equality # and the end of iteration + + # This condition is equivalent to `next_val_diff or + # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` + # Helps potentially avoid 2 mask lookups if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1179,8 +1187,8 @@ def rank_1d( # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). Also be # sure to reset any of the items helping to calculate dups if group_changed: if tiebreak != TIEBREAK_DENSE: From fe6495a289b97d21a5bb559b09813eebe2781213 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:54:33 -0400 Subject: [PATCH 05/16] Add comments, whitespace --- pandas/_libs/algos.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d06ab47c116e9..d24c1452be9c4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -985,8 +985,9 @@ def rank_1d( else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) - # If ascending is true and na_option == 'bottom', - # fill with the largest so NaN + # If ascending and na_option == 'bottom' or descending and + # na_option == 'top' -> we want to rank NaN as the highest + # so fill with the maximum value for the type if ascending ^ (na_option == 'top'): if rank_t is object: nan_fill_val = Infinity() @@ -997,6 +998,8 @@ def rank_1d( else: nan_fill_val = np.inf order = (masked_vals, mask, labels) + + # Otherwise, fill with the lowest value of the type else: if rank_t is object: nan_fill_val = NegInfinity() @@ -1036,8 +1039,8 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != - masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1058,7 +1061,7 @@ def rank_1d( set_as_na = keep_na and mask[lexsort_indexer[i]] # For all cases except TIEBREAK_FIRST when not setting - # nulls, we set the same value at each index + # nulls, we can set the same value at each index if set_as_na: computed_rank = NaN grp_na_count = dups From 
8fae616f2edfd6dc2936f23c8273ad7e006fdf06 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 20:17:35 -0400 Subject: [PATCH 06/16] Simplify conditional --- pandas/_libs/algos.pyx | 82 ++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d24c1452be9c4..a016dbcc07280 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1060,19 +1060,25 @@ def rank_1d( # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST when not setting - # nulls, we can set the same value at each index - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen + # For all cases except TIEBREAK_FIRST for non-null values + # we set the same value at each index + if set_as_na or tiebreak != TIEBREAK_FIRST: + if set_as_na: + computed_rank = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + computed_rank = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + computed_rank = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = computed_rank + + # Otherwise, we need to iterate a compute a rank per index else: for j in range(i - dups + 1, i + 1): if ascending: @@ -1081,10 +1087,6 @@ def rank_1d( out[lexsort_indexer[j]] = \ (2 * i - j - dups + 2 - grp_start) - if set_as_na or tiebreak != TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is @@ -1149,19 +1151,25 @@ def rank_1d( # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST when not setting - # nulls, we set the same value at each index - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen + # For all cases except TIEBREAK_FIRST for non-null values + # we set the same value at each index + if set_as_na or tiebreak != TIEBREAK_FIRST: + if set_as_na: + computed_rank = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + computed_rank = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + computed_rank = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = computed_rank + + # Otherwise, we need to iterate a compute a rank per index else: for j in range(i - dups + 1, i + 1): if ascending: @@ -1170,15 +1178,11 @@ def rank_1d( out[lexsort_indexer[j]] = \ (2 * i - j - dups + 2 - grp_start) - if set_as_na or tiebreak != TIEBREAK_FIRST: - 
for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - - # Look forward to the next value (using the sorting in lexsort_indexer) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. The conditional also needs to handle nan equality - # and the end of iteration + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration # This condition is equivalent to `next_val_diff or # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` From f9479e377d5ae3c7ee19db84a85816d25b2b7ffd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 20:35:02 -0400 Subject: [PATCH 07/16] Remove unused var --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a016dbcc07280..b03d06bfba98b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -946,7 +946,7 @@ def rank_1d( cdef: TiebreakEnumType tiebreak Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0 ndarray[int64_t, ndim=1] lexsort_indexer ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals From a2bea3d933113f1af2253b9b411a1d94de191361 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 21 Mar 2021 01:07:07 -0400 Subject: [PATCH 08/16] Avoid compiler warning --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b03d06bfba98b..ca0b1c19aec60 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -953,7 +953,7 @@ def rank_1d( ndarray[uint8_t, ndim=1] mask bint keep_na, at_end, next_val_diff, check_labels, set_as_na, group_changed rank_t nan_fill_val - float computed_rank + float64_t computed_rank = 0 tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' From a6fd0a5024a8364dc565a786a482400b9e960f00 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 16:45:49 -0400 Subject: [PATCH 09/16] BUG: groupby rank percentile --- pandas/_libs/algos.pyx | 226 ++++++++++++++++-------------- pandas/tests/groupby/test_rank.py | 12 ++ 2 files changed, 130 insertions(+), 108 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ca0b1c19aec60..acd5027462848 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -953,7 +953,7 @@ def rank_1d( ndarray[uint8_t, ndim=1] mask bint keep_na, at_end, next_val_diff, check_labels, set_as_na, group_changed rank_t nan_fill_val - float64_t computed_rank = 0 + float64_t computed_rank = 0, grp_size tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -1039,8 +1039,8 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != + masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1087,16 +1087,15 @@ def rank_1d( out[lexsort_indexer[j]] = \ (2 * i - j - dups + 2 - grp_start) - # Look 
forward to the next value (using the sorting in lexsort_indexer) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. The conditional also needs to handle nan equality - # and the end of iteration - - # This condition is equivalent to `next_val_diff or - # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` - # Helps potentially avoid 2 mask lookups - if next_val_diff or not group_changed: + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration. If group + # changes we do not record seeing a new value in the group + if not group_changed and (next_val_diff or + (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]])): dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1107,110 +1106,121 @@ def rank_1d( # group encountered (used by pct calculations later). Also be # sure to reset any of the items helping to calculate dups if group_changed: + + # If not dense tiebreak, group size used to compute + # percentile will be # of non-null elements in group if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = \ - (i - grp_start + 1 - grp_na_count) + grp_size = i - grp_start + 1 - grp_na_count + + # Otherwise, it will be the number of distinct values + # in the group, subtracting 1 if NaNs are present + # since that is a distinct value we shouldn't count else: - for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = \ - (grp_vals_seen - 1 - (grp_na_count > 0)) + grp_size = grp_vals_seen - (grp_na_count > 0) + + for j in range(grp_start, i + 1): + grp_sizes[lexsort_indexer[j]] = grp_size + dups = sum_ranks = 0 grp_na_count = 0 grp_start = i + 1 grp_vals_seen = 1 else: - with nogil: - for i in range(N): - at_end = i == N - 1 - - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change. Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != - masked_vals[lexsort_indexer[i+1]]) - - # We'll need this check later anyway to determine group size, so just - # compute it here since shortcircuiting won't help - group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - - # Update out only when there is a transition of values or labels. 
- # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if (next_val_diff or group_changed - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - - # If keep_na, check for missing values and assign back - # to the result where appropriate - set_as_na = keep_na and mask[lexsort_indexer[i]] - - # For all cases except TIEBREAK_FIRST for non-null values - # we set the same value at each index - if set_as_na or tiebreak != TIEBREAK_FIRST: - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen - - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - - # Otherwise, we need to iterate a compute a rank per index - else: - for j in range(i - dups + 1, i + 1): - if ascending: - out[lexsort_indexer[j]] = j + 1 - grp_start - else: - out[lexsort_indexer[j]] = \ - (2 * i - j - dups + 2 - grp_start) - - # Look forward to the next value (using the sorting in - # lexsort_indexer). If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. The conditional also needs - # to handle nan equality and the end of iteration - - # This condition is equivalent to `next_val_diff or - # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` - # Helps potentially avoid 2 mask lookups - if next_val_diff or not group_changed: - dups = sum_ranks = 0 - grp_vals_seen += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. Fill in the size of each - # group encountered (used by pct calculations later). Also be - # sure to reset any of the items helping to calculate dups - if group_changed: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = \ - (i - grp_start + 1 - grp_na_count) + for i in range(N): + at_end = i == N - 1 + + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change. Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != + masked_vals[lexsort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) + + # Update out only when there is a transition of values or labels. 
+ # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if (next_val_diff or group_changed + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + + # If keep_na, check for missing values and assign back + # to the result where appropriate + set_as_na = keep_na and mask[lexsort_indexer[i]] + + # For all cases except TIEBREAK_FIRST for non-null values + # we set the same value at each index + if set_as_na or tiebreak != TIEBREAK_FIRST: + if set_as_na: + computed_rank = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + computed_rank = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + computed_rank = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = computed_rank + + # Otherwise, we need to iterate a compute a rank per index + else: + for j in range(i - dups + 1, i + 1): + if ascending: + out[lexsort_indexer[j]] = j + 1 - grp_start else: - for j in range(grp_start, i + 1): - grp_sizes[lexsort_indexer[j]] = \ - (grp_vals_seen - 1 - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 + out[lexsort_indexer[j]] = \ + (2 * i - j - dups + 2 - grp_start) + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration. If group + # changes we do not record seeing a new value in the group + if not group_changed and (next_val_diff or + (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]])): + dups = sum_ranks = 0 + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). 
Also be + # sure to reset any of the items helping to calculate dups + if group_changed: + + # If not dense tiebreak, group size used to compute + # percentile will be # of non-null elements in group + if tiebreak != TIEBREAK_DENSE: + grp_size = i - grp_start + 1 - grp_na_count + + # Otherwise, it will be the number of distinct values + # in the group, subtracting 1 if NaNs are present + # since that is a distinct value we shouldn't count + else: + grp_size = grp_vals_seen - (grp_na_count > 0) + + for j in range(grp_start, i + 1): + grp_sizes[lexsort_indexer[j]] = grp_size + + dups = sum_ranks = 0 + grp_na_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 if pct: for i in range(N): if grp_sizes[i] != 0: diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 6116703ebd174..312aff3ca17e8 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -517,3 +517,15 @@ def test_rank_zero_div(input_key, input_value, output_value): result = df.groupby("A").rank(method="dense", pct=True) expected = DataFrame({"B": output_value}) tm.assert_frame_equal(result, expected) + + +def test_rank_equal_values_on_group_transition(): + # GH#40518 + df = pd.DataFrame([ + [2, 2], + [3, 3], + [2, 3], + ], columns=["group", "val"]) + result = df.groupby(["gr1"])["val"].rank( + method="dense", pct=True, na_option='keep', +) \ No newline at end of file From 8a7b535e672caa3a78ae185568dca13135998c00 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 16:52:47 -0400 Subject: [PATCH 10/16] Add tests --- pandas/tests/groupby/test_rank.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 312aff3ca17e8..c72205796d50b 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -519,13 +519,24 @@ def test_rank_zero_div(input_key, input_value, output_value): tm.assert_frame_equal(result, expected) -def test_rank_equal_values_on_group_transition(): +@pytest.mark.parametrize("fill_value", [np.nan, 3]) +def test_rank_pct_equal_values_on_group_transition(fill_value): # GH#40518 - df = pd.DataFrame([ - [2, 2], - [3, 3], - [2, 3], - ], columns=["group", "val"]) - result = df.groupby(["gr1"])["val"].rank( - method="dense", pct=True, na_option='keep', -) \ No newline at end of file + df = DataFrame( + [ + [-1, 2], + [1, fill_value], + [-1, fill_value], + ], + columns=["group", "val"], + ) + result = df.groupby(["group"])["val"].rank( + method="dense", + pct=True, + ) + if fill_value == 3: + expected = Series([0.5, 1, 1], name="val") + else: + expected = Series([1, np.nan, np.nan], name="val") + + tm.assert_series_equal(result, expected) From 0d555936ba492143e67d33a6532443f6beb0f8ce Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 17:50:07 -0400 Subject: [PATCH 11/16] WIP --- pandas/_libs/algos.pyx | 1 + pandas/tests/groupby/test_rank.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index acd5027462848..ec21ead077f57 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1126,6 +1126,7 @@ def rank_1d( grp_start = i + 1 grp_vals_seen = 1 else: + for i in range(N): at_end = i == N - 1 diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index c72205796d50b..d21767db63871 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ 
-519,12 +519,14 @@ def test_rank_zero_div(input_key, input_value, output_value): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("fill_value", [np.nan, 3]) -def test_rank_pct_equal_values_on_group_transition(fill_value): +@pytest.mark.parametrize("use_nan", [True, False]) +def test_rank_pct_equal_values_on_group_transition(use_nan): # GH#40518 + fill_value = np.nan if use_nan else 2 df = DataFrame( [ - [-1, 2], + [-1, 1], + [-1, 1], [1, fill_value], [-1, fill_value], ], @@ -534,9 +536,9 @@ def test_rank_pct_equal_values_on_group_transition(fill_value): method="dense", pct=True, ) - if fill_value == 3: - expected = Series([0.5, 1, 1], name="val") + if use_nan: + expected = Series([1, 1, np.nan, np.nan], name="val") else: - expected = Series([1, np.nan, np.nan], name="val") + expected = Series([0.5, 0.5, 1, 1], name="val") tm.assert_series_equal(result, expected) From ba5dc7cc6ab1e1a103dcef775acf3992f5775981 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 17:58:39 -0400 Subject: [PATCH 12/16] Simplify changes --- pandas/_libs/algos.pyx | 129 ++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 72 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ca0b1c19aec60..281d2b9a38b2f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -951,11 +951,14 @@ def rank_1d( ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels, set_as_na, group_changed + bint keep_na, at_end, next_val_diff, check_labels, group_changed rank_t nan_fill_val - float64_t computed_rank = 0 tiebreak = tiebreakers[ties_method] + if tiebreak == TIEBREAK_FIRST: + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + keep_na = na_option == 'keep' N = len(values) @@ -1058,45 +1061,36 @@ def rank_1d( # If keep_na, check for missing values and assign back # to the result where appropriate - set_as_na = keep_na and mask[lexsort_indexer[i]] - - # For all cases except TIEBREAK_FIRST for non-null values - # we set the same value at each index - if set_as_na or tiebreak != TIEBREAK_FIRST: - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen - + if keep_na and mask[lexsort_indexer[i]]: + grp_na_count = dups for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - - # Otherwise, we need to iterate a compute a rank per index - else: + out[lexsort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - if ascending: - out[lexsort_indexer[j]] = j + 1 - grp_start - else: - out[lexsort_indexer[j]] = \ - (2 * i - j - dups + 2 - grp_start) + out[lexsort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = j + 1 - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif 
tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = grp_vals_seen # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is # coming up. The conditional also needs to handle nan equality # and the end of iteration - - # This condition is equivalent to `next_val_diff or - # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` - # Helps potentially avoid 2 mask lookups - if next_val_diff or not group_changed: + if next_val_diff or (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1130,8 +1124,8 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != - masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1149,45 +1143,36 @@ def rank_1d( # If keep_na, check for missing values and assign back # to the result where appropriate - set_as_na = keep_na and mask[lexsort_indexer[i]] - - # For all cases except TIEBREAK_FIRST for non-null values - # we set the same value at each index - if set_as_na or tiebreak != TIEBREAK_FIRST: - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen - + if keep_na and mask[lexsort_indexer[i]]: + grp_na_count = dups for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - - # Otherwise, we need to iterate a compute a rank per index - else: + out[lexsort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - if ascending: - out[lexsort_indexer[j]] = j + 1 - grp_start - else: - out[lexsort_indexer[j]] = \ - (2 * i - j - dups + 2 - grp_start) - - # Look forward to the next value (using the sorting in - # lexsort_indexer). If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. 
The conditional also needs - # to handle nan equality and the end of iteration - - # This condition is equivalent to `next_val_diff or - # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` - # Helps potentially avoid 2 mask lookups - if next_val_diff or not group_changed: + out[lexsort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = j + 1 - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = grp_vals_seen + + # Look forward to the next value (using the sorting in lexsort_indexer) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. The conditional also needs to handle nan equality + # and the end of iteration + if next_val_diff or (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 From f6a04b79377bb94dab67a9ab938c1e5939d16325 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 18:08:02 -0400 Subject: [PATCH 13/16] precommit fixup --- pandas/_libs/algos.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 281d2b9a38b2f..4ae134b42b243 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1166,11 +1166,11 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = grp_vals_seen - # Look forward to the next value (using the sorting in lexsort_indexer) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. The conditional also needs to handle nan equality - # and the end of iteration + # Look forward to the next value (using the sorting in + # lexsort_indexer) if the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. 
The conditional also needs + # to handle nan equality and the end of iteration if next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 From 268abd046103eb1615f43ced75cb7fde8a578514 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 18:20:53 -0400 Subject: [PATCH 14/16] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/tests/groupby/test_rank.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 16f76651a65aa..d93fdc57236dc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -502,6 +502,7 @@ Numeric - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) +- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`) - Conversion diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index d21767db63871..accdfb24e8ed7 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -522,11 +522,11 @@ def test_rank_zero_div(input_key, input_value, output_value): @pytest.mark.parametrize("use_nan", [True, False]) def test_rank_pct_equal_values_on_group_transition(use_nan): # GH#40518 - fill_value = np.nan if use_nan else 2 + fill_value = np.nan if use_nan else 3 df = DataFrame( [ [-1, 1], - [-1, 1], + [-1, 2], [1, fill_value], [-1, fill_value], ], @@ -537,8 +537,8 @@ def test_rank_pct_equal_values_on_group_transition(use_nan): pct=True, ) if use_nan: - expected = Series([1, 1, np.nan, np.nan], name="val") + expected = Series([0.5, 1, np.nan, np.nan], name="val") else: - expected = Series([0.5, 0.5, 1, 1], name="val") + expected = Series([1 / 3, 2 / 3, 1, 1], name="val") tm.assert_series_equal(result, expected) From 8eb56cfb1ce4d64a9dfd7f4f26e7f7bcaf8537d7 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 26 Mar 2021 20:44:49 -0400 Subject: [PATCH 15/16] Use float instead of int for grp_size --- pandas/_libs/algos.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0c92cd44b1d22..8d1f08bb70825 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -998,13 +998,13 @@ def rank_1d( TiebreakEnumType tiebreak Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 Py_ssize_t grp_vals_seen=1, grp_na_count=0 - ndarray[int64_t, ndim=1] lexsort_indexer - ndarray[float64_t, ndim=1] grp_sizes, out + ndarray[int64_t, ndim=1] lexsort_indexer, grp_sizes + ndarray[float64_t, ndim=1] out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask bint keep_na, at_end, next_val_diff, check_labels, group_changed rank_t nan_fill_val - float64_t grp_size + int64_t grp_size tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -1017,7 +1017,7 @@ def rank_1d( # TODO Cython 3.0: cast won't be necessary (#2992) assert len(labels) == N out = np.empty(N) - grp_sizes = np.ones(N) + grp_sizes = np.ones(N, dtype=np.int64) # If all 0 labels, can short-circuit later 
label # comparisons From 57e66975def09c000edeb3c4d14712e4b2fe9c84 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 27 Mar 2021 16:00:56 -0400 Subject: [PATCH 16/16] Use intp for lexsort indexer --- pandas/_libs/algos.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 8d1f08bb70825..feeff7e1b2b11 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -998,7 +998,8 @@ def rank_1d( TiebreakEnumType tiebreak Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 Py_ssize_t grp_vals_seen=1, grp_na_count=0 - ndarray[int64_t, ndim=1] lexsort_indexer, grp_sizes + ndarray[int64_t, ndim=1] grp_sizes + ndarray[intp_t, ndim=1] lexsort_indexer ndarray[float64_t, ndim=1] out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask @@ -1074,7 +1075,7 @@ def rank_1d( # each label corresponds to a different group value, # the mask helps you differentiate missing values before # performing sort on the actual values - lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False) + lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False) if not ascending: lexsort_indexer = lexsort_indexer[::-1]
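
For reference, the bug this series fixes (GH#40518) is DataFrameGroupBy.rank giving incorrect results with pct=True when equal values occur between consecutive groups, as described in the whatsnew entry added in PATCH 14. A minimal standalone sketch of the scenario, reusing the data and expected output of test_rank_pct_equal_values_on_group_transition from PATCH 14 (the expected percentiles assume the fixed behavior), is:

import pandas as pd

# Group -1 holds the values 1, 2, 3 and group 1 holds a single 3; the shared
# value 3 sits exactly on the boundary between the two groups after sorting,
# which is the transition the fixed group-size bookkeeping has to handle.
df = pd.DataFrame(
    [
        [-1, 1],
        [-1, 2],
        [1, 3],
        [-1, 3],
    ],
    columns=["group", "val"],
)

result = df.groupby("group")["val"].rank(method="dense", pct=True)

# Group -1 has three distinct values, so its dense pct ranks are 1/3, 2/3, 1;
# group 1 has one distinct value, so its only row ranks at 1.0.
expected = pd.Series([1 / 3, 2 / 3, 1.0, 1.0], name="val")
pd.testing.assert_series_equal(result, expected)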