-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: GroupBy.any/all operate blockwise instead of column-wise #42841
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
02e5318
af4815b
1d41741
ef81c0a
70c4b77
755a9c0
2ccaf19
67cd22e
6b73254
56940e7
f5ee7f6
8cddf6d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -388,10 +388,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels, | |
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def group_any_all(int8_t[::1] out, | ||
const int8_t[::1] values, | ||
const intp_t[:] labels, | ||
const uint8_t[::1] mask, | ||
def group_any_all(int8_t[:, ::1] out, | ||
const int8_t[:, :] values, | ||
const intp_t[::1] labels, | ||
const uint8_t[:, :] mask, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See question below about if |
||
str val_test, | ||
bint skipna, | ||
bint nullable) -> None: | ||
|
@@ -426,9 +426,9 @@ def group_any_all(int8_t[::1] out, | |
-1 to signify a masked position in the case of a nullable input. | ||
""" | ||
cdef: | ||
Py_ssize_t i, N = len(labels) | ||
Py_ssize_t i, j, N = len(labels), K = out.shape[1] | ||
intp_t lab | ||
int8_t flag_val | ||
int8_t flag_val, val | ||
|
||
if val_test == 'all': | ||
# Because the 'all' value of an empty iterable in Python is True we can | ||
|
@@ -443,26 +443,34 @@ def group_any_all(int8_t[::1] out, | |
else: | ||
raise ValueError("'bool_func' must be either 'any' or 'all'!") | ||
|
||
if mask is not None and mask.shape[1] != 1: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can mask be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i was wrong on this, will add suggested |
||
raise NotImplementedError("group_any_all assumes mask.shape[1] == 1") | ||
|
||
out[:] = 1 - flag_val | ||
|
||
with nogil: | ||
for i in range(N): | ||
lab = labels[i] | ||
if lab < 0 or (skipna and mask[i]): | ||
if lab < 0 or (skipna and mask[i, 0]): | ||
# NB: mask[i, 0] here is assuming mask.shape[1] == 1 | ||
continue | ||
|
||
if nullable and mask[i]: | ||
# Set the position as masked if `out[lab] != flag_val`, which | ||
# would indicate True/False has not yet been seen for any/all, | ||
# so by Kleene logic the result is currently unknown | ||
if out[lab] != flag_val: | ||
out[lab] = -1 | ||
continue | ||
for j in range(K): | ||
|
||
if nullable and mask[i, j]: | ||
# Set the position as masked if `out[lab] != flag_val`, which | ||
# would indicate True/False has not yet been seen for any/all, | ||
# so by Kleene logic the result is currently unknown | ||
if out[lab, j] != flag_val: | ||
out[lab, j] = -1 | ||
continue | ||
|
||
val = values[i, j] | ||
|
||
# If True and 'any' or False and 'all', the result is | ||
# already determined | ||
if values[i] == flag_val: | ||
out[lab] = flag_val | ||
# If True and 'any' or False and 'all', the result is | ||
# already determined | ||
if val == flag_val: | ||
out[lab, j] = flag_val | ||
|
||
|
||
# ---------------------------------------------------------------------- | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we have a contiguity guarantee for
labels
here, would that also be the case for other groupby algos? Just wondering because I think the others don't specifylabels
contiguity, so might be able to gain small speedup by doing that.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i think thats the case, yes. merits double-checking, especially since some builds are failing