diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 20489654d7700..0fd9075b91866 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,7 +23,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + groupby as libgroupby, + lib, +) from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -43,6 +46,7 @@ is_datetime64_dtype, is_dtype_equal, is_list_like, + is_object_dtype, is_scalar, is_timedelta64_dtype, pandas_dtype, @@ -1722,6 +1726,76 @@ def map(self, mapper, na_action=None): """ return map_array(self, mapper, na_action=na_action) + # ------------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_any_all( + self, + *, + ngroups: int, + ids: npt.NDArray[np.intp], + val_test: Literal["any", "all"], + skipna: bool, + ): + ncols = 1 if self.ndim == 1 else self.shape[0] + result = np.zeros((ngroups, ncols), dtype=np.int8) + + obj = self + if is_object_dtype(self.dtype) and skipna: + # GH#37501: don't raise on pd.NA when skipna=True + self_mask = self.isna() + if self_mask.any(): + # mask on original values computed separately + obj = obj.copy() + obj[self_mask] = True + + vals = obj.astype(bool, copy=False).view(np.int8) + mask = np.asarray(self.isna()).view(np.uint8) + + # Call func to modify result in place + libgroupby.group_any_all( + out=result, + labels=ids, + values=np.atleast_2d(vals).T, + mask=np.atleast_2d(mask).T, + val_test=val_test, + skipna=skipna, + nullable=False, + ) + + if self.ndim == 1: + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + result = result.astype(bool, copy=False) + return result.T + + def _groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): + cython_dtype = np.dtype(np.float64) + + ncols = 1 if self.ndim == 1 else self.shape[0] + + result = np.zeros((ngroups, ncols), dtype=cython_dtype) + counts = np.zeros(ngroups, dtype=np.int64) + + vals = self.astype(cython_dtype, copy=False) + + # Call func to modify result in place + libgroupby.group_var( + out=result, + labels=ids, + values=np.atleast_2d(vals).T, + counts=counts, + ddof=ddof, + ) + + if self.ndim == 1: + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + result = np.sqrt(result) + return result.T + class ExtensionArraySupportsAnyAll(ExtensionArray): def any(self, *, skipna: bool = True) -> bool: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index eb5a8a52ab6f5..e3d0df408abd4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -24,6 +24,7 @@ from pandas._libs import ( algos, + groupby as libgroupby, lib, ) from pandas._libs.arrays import NDArrayBacked @@ -2104,6 +2105,39 @@ def factorize( # FIXME: shouldn't get here; we are ignoring sort return super().factorize(use_na_sentinel=use_na_sentinel) + def _groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): + cython_dtype = np.dtype(np.float64) + + ncols = 1 if self.ndim == 1 else self.shape[0] + + result = np.zeros((ngroups, ncols), dtype=cython_dtype) + counts = np.zeros(ngroups, dtype=np.int64) + + vals = self.view("i8").astype(cython_dtype, copy=False) + + # Call func to modify result in place + libgroupby.group_var( + out=result, + labels=ids, + values=np.atleast_2d(vals).T, + counts=counts, + ddof=ddof, + is_datetimelike=True, + ) + + if self.ndim == 1: + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + result = np.sqrt(result) + + with warnings.catch_warnings(): + # suppress "RuntimeWarning: invalid value encountered in cast" + warnings.filterwarnings("ignore") + result = result.astype(np.int64, copy=False) + result = result.view(f"m8[{self.unit}]") + return result.T + # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8591cf2d3a4c5..c34b917d79e31 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -13,6 +13,7 @@ import numpy as np from pandas._libs import ( + groupby as libgroupby, lib, missing as libmissing, ) @@ -1382,3 +1383,64 @@ def _accumulate( data, mask = op(data, mask, skipna=skipna, **kwargs) return type(self)(data, mask, copy=False) + + # ------------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_any_all( + self, + *, + ngroups: int, + ids: npt.NDArray[np.intp], + val_test: Literal["any", "all"], + skipna: bool, + ): + from pandas.core.arrays import BooleanArray + + result = np.zeros(ngroups, dtype=np.int8).reshape(-1, 1) + vals = self._data.astype(bool, copy=False).view(np.int8).reshape(-1, 1) + mask = self.isna().view(np.uint8).reshape(-1, 1) + + # Call func to modify result in place + libgroupby.group_any_all( + out=result, + labels=ids, + values=vals, + mask=mask, + val_test=val_test, + skipna=skipna, + nullable=True, + ) + + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + return BooleanArray(result.astype(bool, copy=False), result == -1) + + def _groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): + from pandas.core.arrays import FloatingArray + + result = np.zeros(ngroups, dtype=np.float64).reshape(-1, 1) + counts = np.zeros(ngroups, dtype=np.int64) + vals = self._data.astype(np.float64, copy=False).reshape(-1, 1) + mask = self.isna().view(np.uint8).reshape(-1, 1) + result_mask = np.zeros(result.shape, dtype=np.bool_) + + # Call func to modify result in place + libgroupby.group_var( + out=result, + labels=ids, + values=vals, + mask=mask, + counts=counts, + result_mask=result_mask, + ddof=ddof, + ) + + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + assert result_mask.shape[1] == 1, result_mask.shape + result_mask = result_mask[:, 0] + + return FloatingArray(np.sqrt(result), result_mask.view(np.bool_)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 96f39bb99e544..e344bb12329bc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -93,12 +93,10 @@ class providing the base-class of operations. from pandas.core._numba import executor from pandas.core.arrays import ( BaseMaskedArray, - BooleanArray, Categorical, - DatetimeArray, ExtensionArray, FloatingArray, - TimedeltaArray, + PandasArray, ) from pandas.core.base import ( PandasObject, @@ -1641,36 +1639,8 @@ def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool): """ Shared func to call any / all Cython GroupBy implementations. """ - - def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: - if is_object_dtype(vals.dtype) and skipna: - # GH#37501: don't raise on pd.NA when skipna=True - mask = isna(vals) - if mask.any(): - # mask on original values computed separately - vals = vals.copy() - vals[mask] = True - elif isinstance(vals, BaseMaskedArray): - vals = vals._data - vals = vals.astype(bool, copy=False) - return vals.view(np.int8), bool - - def result_to_bool( - result: np.ndarray, - inference: type, - nullable: bool = False, - ) -> ArrayLike: - if nullable: - return BooleanArray(result.astype(bool, copy=False), result == -1) - else: - return result.astype(inference, copy=False) - return self._get_cythonized_result( - libgroupby.group_any_all, numeric_only=False, - cython_dtype=np.dtype(np.int8), - pre_processing=objs_to_bool, - post_processing=result_to_bool, val_test=val_test, skipna=skipna, ) @@ -1933,28 +1903,8 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - - def _preprocessing(values): - if isinstance(values, BaseMaskedArray): - return values._data, None - return values, None - - def _postprocessing( - vals, inference, nullable: bool = False, result_mask=None - ) -> ArrayLike: - if nullable: - if result_mask.ndim == 2: - result_mask = result_mask[:, 0] - return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_)) - return np.sqrt(vals) - result = self._get_cythonized_result( - libgroupby.group_var, - cython_dtype=np.dtype(np.float64), numeric_only=numeric_only, - needs_counts=True, - pre_processing=_preprocessing, - post_processing=_postprocessing, ddof=ddof, how="std", ) @@ -3627,12 +3577,7 @@ def cummax( @final def _get_cythonized_result( self, - base_func: Callable, - cython_dtype: np.dtype, numeric_only: bool = False, - needs_counts: bool = False, - pre_processing=None, - post_processing=None, how: str = "any_all", **kwargs, ): @@ -3641,27 +3586,8 @@ def _get_cythonized_result( Parameters ---------- - base_func : callable, Cythonized function to be called - cython_dtype : np.dtype - Type of the array that will be modified by the Cython call. numeric_only : bool, default False Whether only numeric datatypes should be computed - needs_counts : bool, default False - Whether the counts should be a part of the Cython call - pre_processing : function, default None - Function to be applied to `values` prior to passing to Cython. - Function should return a tuple where the first element is the - values to be passed to Cython and the second element is an optional - type which the values should be converted to after being returned - by the Cython operation. This function is also responsible for - raising a TypeError if the values have an invalid type. Raises - if `needs_values` is False. - post_processing : function, default None - Function to be applied to result of Cython function. Should accept - an array of values as the first argument and type inferences as its - second argument, i.e. the signature should be - (ndarray, Type). If `needs_nullable=True`, a third argument should be - `nullable`, to allow for processing specific to nullable values. how : str, default any_all Determines if any/all cython interface or std interface is used. **kwargs : dict @@ -3671,87 +3597,29 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ - if post_processing and not callable(post_processing): - raise ValueError("'post_processing' must be a callable!") - if pre_processing and not callable(pre_processing): - raise ValueError("'pre_processing' must be a callable!") - grouper = self.grouper ids, _, ngroups = grouper.group_info - base_func = partial(base_func, labels=ids) - def blk_func(values: ArrayLike) -> ArrayLike: - values = values.T - ncols = 1 if values.ndim == 1 else values.shape[1] - - result: ArrayLike - result = np.zeros(ngroups * ncols, dtype=cython_dtype) - result = result.reshape((ngroups, ncols)) - - func = partial(base_func, out=result) - - inferences = None - - if needs_counts: - counts = np.zeros(ngroups, dtype=np.int64) - func = partial(func, counts=counts) - - is_datetimelike = values.dtype.kind in ["m", "M"] - vals = values - if is_datetimelike and how == "std": - vals = vals.view("i8") - if pre_processing: - vals, inferences = pre_processing(vals) - - vals = vals.astype(cython_dtype, copy=False) - if vals.ndim == 1: - vals = vals.reshape((-1, 1)) - func = partial(func, values=vals) - - if how != "std" or isinstance(values, BaseMaskedArray): - mask = isna(values).view(np.uint8) - if mask.ndim == 1: - mask = mask.reshape(-1, 1) - func = partial(func, mask=mask) - - if how != "std": - is_nullable = isinstance(values, BaseMaskedArray) - func = partial(func, nullable=is_nullable) - - elif isinstance(values, BaseMaskedArray): - result_mask = np.zeros(result.shape, dtype=np.bool_) - func = partial(func, result_mask=result_mask) - - # Call func to modify result in place - if how == "std": - func(**kwargs, is_datetimelike=is_datetimelike) - else: - func(**kwargs) - - if values.ndim == 1: - assert result.shape[1] == 1, result.shape - result = result[:, 0] - - if post_processing: - pp_kwargs: dict[str, bool | np.ndarray] = {} - pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) - if how == "std" and pp_kwargs["nullable"]: - pp_kwargs["result_mask"] = result_mask - - result = post_processing(result, inferences, **pp_kwargs) - - if how == "std" and is_datetimelike: - values = cast("DatetimeArray | TimedeltaArray", values) - unit = values.unit - with warnings.catch_warnings(): - # suppress "RuntimeWarning: invalid value encountered in cast" - warnings.filterwarnings("ignore") - result = result.astype(np.int64, copy=False) - result = result.view(f"m8[{unit}]") + if isinstance(values, ExtensionArray): + if how == "std": + return values._groupby_std( + ngroups=ngroups, ids=ids, ddof=kwargs["ddof"] + ) + elif how == "any_all": + return values._groupby_any_all( + ngroups=ngroups, + ids=ids, + skipna=kwargs["skipna"], + val_test=kwargs["val_test"], + ) + else: + raise NotImplementedError - return result.T + arr = PandasArray(values) + result = blk_func(arr) + return result # Operate block-wise instead of column-by-column mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)