diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dd8487e5702d0..12ecf5d54b356 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -95,7 +95,6 @@ class providing the base-class of operations. from pandas.core._numba import executor from pandas.core.arrays import ( BaseMaskedArray, - BooleanArray, Categorical, ExtensionArray, FloatingArray, @@ -1545,6 +1544,8 @@ def array_func(values: ArrayLike) -> ArrayLike: # and non-applicable functions # try to python agg # TODO: shouldn't min_count matter? + if how in ["any", "all"]: + raise # TODO: re-raise as TypeError? result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) return result @@ -1694,45 +1695,6 @@ def _obj_1d_constructor(self) -> Callable: assert isinstance(self.obj, Series) return self.obj._constructor - @final - def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool): - """ - Shared func to call any / all Cython GroupBy implementations. - """ - - def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: - if is_object_dtype(vals.dtype) and skipna: - # GH#37501: don't raise on pd.NA when skipna=True - mask = isna(vals) - if mask.any(): - # mask on original values computed separately - vals = vals.copy() - vals[mask] = True - elif isinstance(vals, BaseMaskedArray): - vals = vals._data - vals = vals.astype(bool, copy=False) - return vals.view(np.int8), bool - - def result_to_bool( - result: np.ndarray, - inference: type, - result_mask, - ) -> ArrayLike: - if result_mask is not None: - return BooleanArray(result.astype(bool, copy=False), result_mask) - else: - return result.astype(inference, copy=False) - - return self._get_cythonized_result( - libgroupby.group_any_all, - numeric_only=False, - cython_dtype=np.dtype(np.int8), - pre_processing=objs_to_bool, - post_processing=result_to_bool, - val_test=val_test, - skipna=skipna, - ) - @final @Substitution(name="groupby") @Appender(_common_see_also) @@ -1751,7 +1713,11 @@ def any(self, skipna: bool = True): DataFrame or Series of boolean values, where a value is True if any element is True within its respective group, False otherwise. """ - return self._bool_agg("any", skipna) + return self._cython_agg_general( + "any", + alt=lambda x: Series(x).any(skipna=skipna), + skipna=skipna, + ) @final @Substitution(name="groupby") @@ -1771,7 +1737,11 @@ def all(self, skipna: bool = True): DataFrame or Series of boolean values, where a value is True if all elements are True within its respective group, False otherwise. """ - return self._bool_agg("all", skipna) + return self._cython_agg_general( + "all", + alt=lambda x: Series(x).all(skipna=skipna), + skipna=skipna, + ) @final @Substitution(name="groupby") @@ -3702,116 +3672,6 @@ def cummax( "cummax", numeric_only=numeric_only, skipna=skipna ) - @final - def _get_cythonized_result( - self, - base_func: Callable, - cython_dtype: np.dtype, - numeric_only: bool = False, - pre_processing=None, - post_processing=None, - how: str = "any_all", - **kwargs, - ): - """ - Get result for Cythonized functions. - - Parameters - ---------- - base_func : callable, Cythonized function to be called - cython_dtype : np.dtype - Type of the array that will be modified by the Cython call. - numeric_only : bool, default False - Whether only numeric datatypes should be computed - pre_processing : function, default None - Function to be applied to `values` prior to passing to Cython. - Function should return a tuple where the first element is the - values to be passed to Cython and the second element is an optional - type which the values should be converted to after being returned - by the Cython operation. This function is also responsible for - raising a TypeError if the values have an invalid type. Raises - if `needs_values` is False. - post_processing : function, default None - Function to be applied to result of Cython function. Should accept - an array of values as the first argument and type inferences as its - second argument, i.e. the signature should be - (ndarray, Type). If `needs_nullable=True`, a third argument should be - `nullable`, to allow for processing specific to nullable values. - how : str, default any_all - Determines if any/all cython interface or std interface is used. - **kwargs : dict - Extra arguments to be passed back to Cython funcs - - Returns - ------- - `Series` or `DataFrame` with filled values - """ - if post_processing and not callable(post_processing): - raise ValueError("'post_processing' must be a callable!") - if pre_processing and not callable(pre_processing): - raise ValueError("'pre_processing' must be a callable!") - - grouper = self.grouper - - ids, _, ngroups = grouper.group_info - - base_func = partial(base_func, labels=ids) - - def blk_func(values: ArrayLike) -> ArrayLike: - values = values.T - ncols = 1 if values.ndim == 1 else values.shape[1] - - result: ArrayLike - result = np.zeros(ngroups * ncols, dtype=cython_dtype) - result = result.reshape((ngroups, ncols)) - - func = partial(base_func, out=result) - - inferences = None - - vals = values - if pre_processing: - vals, inferences = pre_processing(vals) - - vals = vals.astype(cython_dtype, copy=False) - if vals.ndim == 1: - vals = vals.reshape((-1, 1)) - func = partial(func, values=vals) - - mask = isna(values).view(np.uint8) - if mask.ndim == 1: - mask = mask.reshape(-1, 1) - func = partial(func, mask=mask) - - result_mask = None - if isinstance(values, BaseMaskedArray): - result_mask = np.zeros(result.shape, dtype=np.bool_) - - func = partial(func, result_mask=result_mask) - - # Call func to modify result in place - func(**kwargs) - - if values.ndim == 1: - assert result.shape[1] == 1, result.shape - result = result[:, 0] - if result_mask is not None: - assert result_mask.shape[1] == 1, result_mask.shape - result_mask = result_mask[:, 0] - - if post_processing: - result = post_processing(result, inferences, result_mask=result_mask) - - return result.T - - # Operate block-wise instead of column-by-column - mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) - - res_mgr = mgr.grouped_reduce(blk_func) - - out = self._wrap_agged_manager(res_mgr) - return self._wrap_aggregated_output(out) - @final @Substitution(name="groupby") def shift( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e4ae25ff26207..eb6dd3dbfe7c5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -115,7 +115,9 @@ class WrappedCythonOp: # Functions for which we do _not_ attempt to cast the cython result # back to the original dtype. - cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) + cast_blocklist = frozenset( + ["any", "all", "rank", "count", "size", "idxmin", "idxmax"] + ) def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: self.kind = kind @@ -124,6 +126,8 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: _CYTHON_FUNCTIONS: dict[str, dict] = { "aggregate": { + "any": functools.partial(libgroupby.group_any_all, val_test="any"), + "all": functools.partial(libgroupby.group_any_all, val_test="all"), "sum": "group_sum", "prod": "group_prod", "min": "group_min", @@ -253,7 +257,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): # don't go down a group-by-group path, since in the empty-groups # case that would fail to raise raise TypeError(f"Cannot perform {how} with non-ordered Categorical") - if how not in ["rank"]: + if how not in ["rank", "any", "all"]: # only "rank" is implemented in cython raise NotImplementedError(f"{dtype} dtype not supported") @@ -352,10 +356,13 @@ def _ea_wrap_cython_operation( ) elif isinstance(values, Categorical): - assert self.how == "rank" # the only one implemented ATM - assert values.ordered # checked earlier + assert self.how in ["rank", "any", "all"] mask = values.isna() - npvalues = values._ndarray + if self.how == "rank": + assert values.ordered # checked earlier + npvalues = values._ndarray + else: + npvalues = values.astype(bool) res_values = self._cython_op_ndim_compat( npvalues, @@ -546,6 +553,19 @@ def _call_cython_op( if values.dtype == "float16": values = values.astype(np.float32) + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if dtype == object: + if kwargs["skipna"]: + # GH#37501: don't raise on pd.NA when skipna=True + if mask.any(): + # mask on original values computed separately + values = values.copy() + values[mask] = True + values = values.astype(bool, copy=False).view(np.int8) + is_numeric = True + values = values.T if mask is not None: mask = mask.T @@ -584,6 +604,16 @@ def _call_cython_op( result_mask=result_mask, **kwargs, ) + elif self.how in ["any", "all"]: + func( + out=result, + values=values, + labels=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + result = result.astype(bool, copy=False) else: raise NotImplementedError(f"{self.how} is not implemented") else: