-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: reduce overhead in groupby _cython_operation #40317
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
a392dd0
c728a15
c9c764e
c738c64
d00aea9
606c4c2
27f27d1
7414ce7
897b989
cf0e90a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,7 @@ | |
import pandas._libs.groupby as libgroupby | ||
import pandas._libs.reduction as libreduction | ||
from pandas._typing import ( | ||
ArrayLike, | ||
DtypeObj, | ||
F, | ||
FrameOrSeries, | ||
Shape, | ||
|
@@ -46,7 +46,6 @@ | |
maybe_downcast_to_dtype, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
ensure_float, | ||
ensure_float64, | ||
ensure_int64, | ||
ensure_int_or_float, | ||
|
@@ -490,7 +489,9 @@ def _get_cython_func_and_vals( | |
return func, values | ||
|
||
@final | ||
def _disallow_invalid_ops(self, values: ArrayLike, how: str): | ||
def _disallow_invalid_ops( | ||
self, dtype: DtypeObj, how: str, is_numeric: bool = False | ||
): | ||
""" | ||
Check if we can do this operation with our cython functions. | ||
|
||
|
@@ -500,7 +501,9 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str): | |
This is either not a valid function for this dtype, or | ||
valid but not implemented in cython. | ||
""" | ||
dtype = values.dtype | ||
if is_numeric: | ||
# never an invalid op for those dtypes, so return early as fastpath | ||
return | ||
|
||
if is_categorical_dtype(dtype) or is_sparse(dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we're micro-optimizing, these can be replaced with isinstance(dtype, FooDtype) checks There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was planning to do a follow-up PR with several other dtype check optimizations, so will include it there |
||
# categoricals are only 1d, so we | ||
|
@@ -594,34 +597,36 @@ def _cython_operation( | |
# as we can have 1D ExtensionArrays that we need to treat as 2D | ||
assert axis == 1, axis | ||
|
||
dtype = values.dtype | ||
is_numeric = is_numeric_dtype(dtype) | ||
|
||
# can we do this operation with our cython functions | ||
# if not raise NotImplementedError | ||
self._disallow_invalid_ops(values, how) | ||
self._disallow_invalid_ops(dtype, how, is_numeric) | ||
|
||
if is_extension_array_dtype(values.dtype): | ||
if is_extension_array_dtype(dtype): | ||
# error: Incompatible return value type (got "Tuple[ndarray, | ||
# Optional[List[str]]]", expected "ndarray") | ||
return self._ea_wrap_cython_operation( # type: ignore[return-value] | ||
kind, values, how, axis, min_count, **kwargs | ||
) | ||
|
||
is_datetimelike = needs_i8_conversion(values.dtype) | ||
is_numeric = is_numeric_dtype(values.dtype) | ||
is_datetimelike = needs_i8_conversion(dtype) | ||
|
||
if is_datetimelike: | ||
values = values.view("int64") | ||
is_numeric = True | ||
elif is_bool_dtype(values.dtype): | ||
elif is_bool_dtype(dtype): | ||
values = ensure_int_or_float(values) | ||
elif is_integer_dtype(values): | ||
elif is_integer_dtype(dtype): | ||
# we use iNaT for the missing value on ints | ||
# so pre-convert to guard this condition | ||
if (values == iNaT).any(): | ||
values = ensure_float64(values) | ||
else: | ||
values = ensure_int_or_float(values) | ||
elif is_numeric and not is_complex_dtype(values): | ||
values = ensure_float64(ensure_float(values)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
elif is_numeric and not is_complex_dtype(dtype): | ||
values = ensure_float64(values) | ||
else: | ||
values = values.astype(object) | ||
|
||
|
@@ -656,20 +661,18 @@ def _cython_operation( | |
codes, _, _ = self.group_info | ||
|
||
if kind == "aggregate": | ||
result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) | ||
result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) | ||
counts = np.zeros(self.ngroups, dtype=np.int64) | ||
result = self._aggregate(result, counts, values, codes, func, min_count) | ||
elif kind == "transform": | ||
result = maybe_fill( | ||
np.empty(values.shape, dtype=out_dtype), fill_value=np.nan | ||
) | ||
result = maybe_fill(np.empty(values.shape, dtype=out_dtype)) | ||
|
||
# TODO: min_count | ||
result = self._transform( | ||
result, values, codes, func, is_datetimelike, **kwargs | ||
) | ||
|
||
if is_integer_dtype(result) and not is_datetimelike: | ||
if is_integer_dtype(result.dtype) and not is_datetimelike: | ||
mask = result == iNaT | ||
if mask.any(): | ||
result = result.astype("float64") | ||
|
@@ -689,9 +692,7 @@ def _cython_operation( | |
# e.g. if we are int64 and need to restore to datetime64/timedelta64 | ||
# "rank" is the only member of cython_cast_blocklist we get here | ||
dtype = maybe_cast_result_dtype(orig_values.dtype, how) | ||
# error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type | ||
# "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]" | ||
result = maybe_downcast_to_dtype(result, dtype) # type: ignore[arg-type] | ||
result = maybe_downcast_to_dtype(result, dtype) | ||
|
||
return result | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe_fill
is only used in_cython_operation
, and thus always used with an ndarray and with np.nan as fill value, so therefore simplified this function a bit, based on those assumptions.