-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF/BUG: use masked algo in groupby cummin and cummax #40651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 40 commits
7cd4dc3
91984dc
b371cc5
69cce96
dd7f324
64680d4
be16f65
9442846
f089175
31409f8
0c05f74
18dcc94
5c60a1f
f0c27ce
2fa80ad
280c7e5
dca28cf
7e2fbe0
0ebb97a
0009dfd
6663832
8247f82
71e1c4f
c6cf9ee
02768ec
836175b
293dc6e
478c6c9
fa45a9a
1632b81
1bb344e
97d9eea
892a92a
a239a68
f98ca35
e7ed12f
a1422ba
482a209
4e7404d
251c02a
3de7e5e
237f86f
a1b0c04
5e1dac4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1277,6 +1277,7 @@ def group_min(groupby_t[:, ::1] out, | |
@cython.wraparound(False) | ||
cdef group_cummin_max(groupby_t[:, ::1] out, | ||
ndarray[groupby_t, ndim=2] values, | ||
uint8_t[:, ::1] mask, | ||
const intp_t[:] labels, | ||
int ngroups, | ||
bint is_datetimelike, | ||
|
@@ -1290,6 +1291,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, | |
Array to store cummin/max in. | ||
values : np.ndarray[groupby_t, ndim=2] | ||
Values to take cummin/max of. | ||
mask : np.ndarray[bool] or None | ||
If not None, indices represent missing values, | ||
otherwise the mask will not be used | ||
labels : np.ndarray[np.intp] | ||
Labels to group by. | ||
ngroups : int | ||
|
@@ -1307,11 +1311,14 @@ cdef group_cummin_max(groupby_t[:, ::1] out, | |
cdef: | ||
Py_ssize_t i, j, N, K, size | ||
groupby_t val, mval | ||
ndarray[groupby_t, ndim=2] accum | ||
groupby_t[:, ::1] accum | ||
intp_t lab | ||
bint val_is_nan, use_mask | ||
|
||
use_mask = mask is not None | ||
|
||
N, K = (<object>values).shape | ||
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) | ||
accum = np.empty((ngroups, K), dtype=values.dtype) | ||
if groupby_t is int64_t: | ||
accum[:] = -_int64_max if compute_max else _int64_max | ||
elif groupby_t is uint64_t: | ||
|
@@ -1326,11 +1333,29 @@ cdef group_cummin_max(groupby_t[:, ::1] out, | |
if lab < 0: | ||
continue | ||
for j in range(K): | ||
val = values[i, j] | ||
val_is_nan = False | ||
|
||
if use_mask: | ||
if mask[i, j]: | ||
|
||
# `out` does not need to be set since it | ||
# will be masked anyway | ||
val_is_nan = True | ||
else: | ||
|
||
# If using the mask, we can avoid grabbing the | ||
# value unless necessary | ||
val = values[i, j] | ||
|
||
if _treat_as_na(val, is_datetimelike): | ||
out[i, j] = val | ||
# Otherwise, `out` must be set accordingly if the | ||
# value is missing | ||
else: | ||
val = values[i, j] | ||
if _treat_as_na(val, is_datetimelike): | ||
val_is_nan = True | ||
out[i, j] = val | ||
|
||
if not val_is_nan: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does it make sense to implement this as a separate function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have a strong opinion about this. The question would be the tradeoff between a bit more complexity/branching vs duplication/increased package size (if we end up adding masked support to a lot more of these grouped algos) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any guess what the impact on package size is? potential duplication might be addressed by making e.g. refactoring L1340-1347 or L 1302-1308 into helper functions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on rough estimate, binaries generated from groupby.pyx take up ~5% of total |
||
mval = accum[lab, j] | ||
if compute_max: | ||
if val > mval: | ||
|
@@ -1347,9 +1372,18 @@ def group_cummin(groupby_t[:, ::1] out, | |
ndarray[groupby_t, ndim=2] values, | ||
const intp_t[:] labels, | ||
int ngroups, | ||
bint is_datetimelike) -> None: | ||
bint is_datetimelike, | ||
uint8_t[:, ::1] mask=None) -> None: | ||
"""See group_cummin_max.__doc__""" | ||
group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=False) | ||
group_cummin_max( | ||
out, | ||
values, | ||
mask, | ||
labels, | ||
ngroups, | ||
is_datetimelike, | ||
compute_max=False | ||
) | ||
|
||
|
||
@cython.boundscheck(False) | ||
|
@@ -1358,6 +1392,15 @@ def group_cummax(groupby_t[:, ::1] out, | |
ndarray[groupby_t, ndim=2] values, | ||
const intp_t[:] labels, | ||
int ngroups, | ||
bint is_datetimelike) -> None: | ||
bint is_datetimelike, | ||
uint8_t[:, ::1] mask=None) -> None: | ||
"""See group_cummin_max.__doc__""" | ||
group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=True) | ||
group_cummin_max( | ||
out, | ||
values, | ||
mask, | ||
labels, | ||
ngroups, | ||
is_datetimelike, | ||
compute_max=True | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -67,6 +67,10 @@ | |
) | ||
|
||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.arrays.masked import ( | ||
BaseMaskedArray, | ||
BaseMaskedDtype, | ||
) | ||
import pandas.core.common as com | ||
from pandas.core.frame import DataFrame | ||
from pandas.core.generic import NDFrame | ||
|
@@ -122,6 +126,8 @@ def __init__(self, kind: str, how: str): | |
}, | ||
} | ||
|
||
_MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax"} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is a separate variable necessary here? that is just confusing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it makes sense to store this information in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this would be fine, its not clear that this happening in this PR |
||
|
||
_cython_arity = {"ohlc": 4} # OHLC | ||
|
||
# Note: we make this a classmethod and pass kind+how so that caching | ||
|
@@ -257,6 +263,9 @@ def get_out_dtype(self, dtype: np.dtype) -> np.dtype: | |
out_dtype = "object" | ||
return np.dtype(out_dtype) | ||
|
||
def uses_mask(self) -> bool: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is very far from actual use, would remove There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Think this falls under the umbrella of your comment here #40651 (comment). I liked folding this functionality into There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree having this in |
||
return self.how in self._MASKED_CYTHON_FUNCTIONS | ||
|
||
|
||
class BaseGrouper: | ||
""" | ||
|
@@ -606,9 +615,49 @@ def _ea_wrap_cython_operation( | |
f"function is not implemented for this dtype: {values.dtype}" | ||
) | ||
|
||
@final | ||
def _masked_ea_wrap_cython_operation( | ||
self, | ||
kind: str, | ||
values: BaseMaskedArray, | ||
how: str, | ||
axis: int, | ||
min_count: int = -1, | ||
**kwargs, | ||
) -> BaseMaskedArray: | ||
""" | ||
Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's | ||
and cython algorithms which accept a mask. | ||
""" | ||
orig_values = values | ||
|
||
# Copy to ensure input and result masks don't end up shared | ||
mask = values._mask.copy() | ||
arr = values._data | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if is_integer_dtype(arr.dtype) or is_bool_dtype(arr.dtype): | ||
# IntegerArray or BooleanArray | ||
arr = ensure_int_or_float(arr) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW im planning to kill off this function; for EAs this is always just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for bringing up - realized this whole condition can be simplified since we actually have an ndarray at this point |
||
|
||
res_values = self._cython_operation( | ||
kind, arr, how, axis, min_count, mask=mask, **kwargs | ||
) | ||
dtype = maybe_cast_result_dtype(orig_values.dtype, how) | ||
assert isinstance(dtype, BaseMaskedDtype) | ||
cls = dtype.construct_array_type() | ||
|
||
return cls(res_values.astype(dtype.type, copy=False), mask) | ||
|
||
@final | ||
def _cython_operation( | ||
self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs | ||
self, | ||
kind: str, | ||
values, | ||
how: str, | ||
axis: int, | ||
min_count: int = -1, | ||
mask: np.ndarray | None = None, | ||
**kwargs, | ||
) -> ArrayLike: | ||
""" | ||
Returns the values of a cython operation. | ||
|
@@ -632,10 +681,16 @@ def _cython_operation( | |
# if not raise NotImplementedError | ||
cy_op.disallow_invalid_ops(dtype, is_numeric) | ||
|
||
func_uses_mask = cy_op.uses_mask() | ||
if is_extension_array_dtype(dtype): | ||
return self._ea_wrap_cython_operation( | ||
kind, values, how, axis, min_count, **kwargs | ||
) | ||
if isinstance(values, BaseMaskedArray) and func_uses_mask: | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return self._masked_ea_wrap_cython_operation( | ||
kind, values, how, axis, min_count, **kwargs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i really don't understand all of this code duplication. this is adding huge complexity. pls reduce it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Jeff, did you actually read the previous responses to your similar comment? (https://github.com/pandas-dev/pandas/pull/40651/files#r603319910) Can you then please answer there to the concrete reasons given. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes and its a terrible pattern. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this duplication of code is ridiculous. We have a VERY large codebase. Having this kind of separate logic is amazingling confusing & is humungous tech debt. This is heavily used code and needs to be carefully modified. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand the concern about adding code complexity - my thinking was that if the goal is for nullable types to become the default in pandas, then direct support makes sense. And in that case, nullable types would need to be special-cased somewhere, and I think the separate function is cleaner than interleaving in If direct support for nullable dtypes is not desired, we can just close this. If it is, I'll keep trying to think of ways to achieve this without adding more code, but any suggestions there would be welcome! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Proper support for nullable dtypes is certainly desired (how to add it exactly can of course be discussed), so thanks a lot @mzeitlin11 for your efforts here. AFAIK, it's correct we need some special casing for it somewhere (that's the whole point of this PR is to add special handling for it). @jreback please try to stay constructive (eg answer to our arguments or provide concrete suggestions on where you would put it / how you would do it differently) and please mind your language (there is no need to call the approach taken by a contributor "terrible"). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This decision has not been made.
Agreed.
as Joris correctly pointed out, that is not viable ATM. I think a lot of this dispatch logic eventually belongs in WrappedCythonOp (which i've been vaguely planning on doing next time there aren't any open PRs touching this code), at which point we can reconsider flattening this
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jbrockmendel if you plan further refactoring of this code, I'm happy to just mothball this pr for now. The real benefit won't come in until more groupby algos allow a mask on this path anyway, so not worth adding now if it's just going to cause more pain in future refactoring. I also like the idea of approach 5 instead of this - could start looking into that if you think it's a promising direction. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
From today's call, I think the plan is to move forward with this first.
Long-term I think this is the right way to go to get the general case right, so I'd encourage you if you're interested in trying to implement this on the EA- separate PR(s). |
||
) | ||
else: | ||
return self._ea_wrap_cython_operation( | ||
kind, values, how, axis, min_count, **kwargs | ||
) | ||
|
||
elif values.ndim == 1: | ||
# expand to 2d, dispatch, then squeeze if appropriate | ||
|
@@ -646,6 +701,7 @@ def _cython_operation( | |
how=how, | ||
axis=1, | ||
min_count=min_count, | ||
mask=mask, | ||
**kwargs, | ||
) | ||
if res.shape[0] == 1: | ||
|
@@ -673,6 +729,9 @@ def _cython_operation( | |
assert axis == 1 | ||
values = values.T | ||
|
||
if mask is not None: | ||
mask = mask.reshape(values.shape, order="C") | ||
|
||
out_shape = cy_op.get_output_shape(ngroups, values) | ||
func, values = cy_op.get_cython_func_and_vals(values, is_numeric) | ||
out_dtype = cy_op.get_out_dtype(values.dtype) | ||
|
@@ -693,7 +752,18 @@ def _cython_operation( | |
func(result, counts, values, comp_ids, min_count) | ||
elif kind == "transform": | ||
# TODO: min_count | ||
func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) | ||
if func_uses_mask: | ||
func( | ||
result, | ||
values, | ||
comp_ids, | ||
ngroups, | ||
is_datetimelike, | ||
mask=mask, | ||
**kwargs, | ||
) | ||
else: | ||
func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) | ||
|
||
if kind == "aggregate": | ||
# i.e. counts is defined. Locations where count<min_count | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this costly? worth using setup_cache?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks to take ~0.25s. So might be worth caching, but appears
setup_cache
can't be parameterized, so would have to ugly up the benchmark a bit.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we just make N // 10
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yep have shrunk benchmark