Skip to content

PERF: reduce overhead in groupby _cython_operation #40317

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
8 changes: 4 additions & 4 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,12 +556,12 @@ def infer_fill_value(val):
return np.nan


def maybe_fill(arr: np.ndarray) -> np.ndarray:
    """
    Fill a numpy.ndarray with NaN, unless we have an integer or boolean dtype.

    Parameters
    ----------
    arr : np.ndarray
        Array to fill in place.

    Returns
    -------
    np.ndarray
        The same array, filled with NaN where the dtype permits it.
    """
    # NOTE(review): maybe_fill is only used in _cython_operation, and thus is
    # always called with an ndarray and with np.nan as the fill value, so the
    # function was simplified based on those assumptions.
    # Unsigned int, signed int, and bool dtypes cannot hold NaN, so they are
    # left untouched (their uninitialized contents are filled by the caller).
    if arr.dtype.kind not in ("u", "i", "b"):
        arr.fill(np.nan)
    return arr


Expand Down
41 changes: 21 additions & 20 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import pandas._libs.groupby as libgroupby
import pandas._libs.reduction as libreduction
from pandas._typing import (
ArrayLike,
DtypeObj,
F,
FrameOrSeries,
Shape,
Expand All @@ -46,7 +46,6 @@
maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
ensure_float,
ensure_float64,
ensure_int64,
ensure_int_or_float,
Expand Down Expand Up @@ -490,7 +489,9 @@ def _get_cython_func_and_vals(
return func, values

@final
def _disallow_invalid_ops(self, values: ArrayLike, how: str):
def _disallow_invalid_ops(
self, dtype: DtypeObj, how: str, is_numeric: bool = False
):
"""
Check if we can do this operation with our cython functions.

Expand All @@ -500,7 +501,9 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str):
This is either not a valid function for this dtype, or
valid but not implemented in cython.
"""
dtype = values.dtype
if is_numeric:
# never an invalid op for those dtypes, so return early as fastpath
return

if is_categorical_dtype(dtype) or is_sparse(dtype):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we're micro-optimizing, these can be replaced with isinstance(dtype, FooDtype) checks

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was planning to do a follow-up PR with several other dtype check optimizations, so will include it there

# categoricals are only 1d, so we
Expand Down Expand Up @@ -594,34 +597,36 @@ def _cython_operation(
# as we can have 1D ExtensionArrays that we need to treat as 2D
assert axis == 1, axis

dtype = values.dtype
is_numeric = is_numeric_dtype(dtype)

# can we do this operation with our cython functions
# if not raise NotImplementedError
self._disallow_invalid_ops(values, how)
self._disallow_invalid_ops(dtype, how, is_numeric)

if is_extension_array_dtype(values.dtype):
if is_extension_array_dtype(dtype):
# error: Incompatible return value type (got "Tuple[ndarray,
# Optional[List[str]]]", expected "ndarray")
return self._ea_wrap_cython_operation( # type: ignore[return-value]
kind, values, how, axis, min_count, **kwargs
)

is_datetimelike = needs_i8_conversion(values.dtype)
is_numeric = is_numeric_dtype(values.dtype)
is_datetimelike = needs_i8_conversion(dtype)

if is_datetimelike:
values = values.view("int64")
is_numeric = True
elif is_bool_dtype(values.dtype):
elif is_bool_dtype(dtype):
values = ensure_int_or_float(values)
elif is_integer_dtype(values):
elif is_integer_dtype(dtype):
# we use iNaT for the missing value on ints
# so pre-convert to guard this condition
if (values == iNaT).any():
values = ensure_float64(values)
else:
values = ensure_int_or_float(values)
elif is_numeric and not is_complex_dtype(values):
values = ensure_float64(ensure_float(values))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, ensure_float was needed to convert a nullable float EA to a float ndarray. But now EAs already take a different code path above.

elif is_numeric and not is_complex_dtype(dtype):
values = ensure_float64(values)
else:
values = values.astype(object)

Expand Down Expand Up @@ -656,20 +661,18 @@ def _cython_operation(
codes, _, _ = self.group_info

if kind == "aggregate":
result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan)
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
counts = np.zeros(self.ngroups, dtype=np.int64)
result = self._aggregate(result, counts, values, codes, func, min_count)
elif kind == "transform":
result = maybe_fill(
np.empty(values.shape, dtype=out_dtype), fill_value=np.nan
)
result = maybe_fill(np.empty(values.shape, dtype=out_dtype))

# TODO: min_count
result = self._transform(
result, values, codes, func, is_datetimelike, **kwargs
)

if is_integer_dtype(result) and not is_datetimelike:
if is_integer_dtype(result.dtype) and not is_datetimelike:
mask = result == iNaT
if mask.any():
result = result.astype("float64")
Expand All @@ -689,9 +692,7 @@ def _cython_operation(
# e.g. if we are int64 and need to restore to datetime64/timedelta64
# "rank" is the only member of cython_cast_blocklist we get here
dtype = maybe_cast_result_dtype(orig_values.dtype, how)
# error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type
# "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]"
result = maybe_downcast_to_dtype(result, dtype) # type: ignore[arg-type]
result = maybe_downcast_to_dtype(result, dtype)

return result

Expand Down