From 777c35c83e7ad35fa8f665497515be633c109b45 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 11:08:27 -0800 Subject: [PATCH 1/6] REF: implement groupby_any_all, groupby_std --- pandas/core/arrays/base.py | 181 ++++++++++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 18 +++- 2 files changed, 195 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c261a41e1e77e..0363a749572d7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,6 +8,7 @@ """ from __future__ import annotations +from functools import partial import operator from typing import ( TYPE_CHECKING, @@ -24,7 +25,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + groupby as libgroupby, + lib, +) from pandas._typing import ( ArrayLike, AstypeArg, @@ -58,6 +62,7 @@ is_datetime64_dtype, is_dtype_equal, is_list_like, + is_object_dtype, is_scalar, is_timedelta64_dtype, pandas_dtype, @@ -1688,6 +1693,180 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) + # ------------------------------------------------------------------------ + # GroupBy Methods + + def groupby_any_all( + self, *, ngroups: int, ids: npt.NDArray[np.intp], val_test: str, skipna: bool + ): + values = self + how = "any_all" + base_func = libgroupby.group_any_all + cython_dtype = np.dtype(np.int8) + needs_counts = False + + from pandas.core.arrays import ( + BaseMaskedArray, + BooleanArray, + ) + + def pre_processing(vals: ArrayLike) -> tuple[np.ndarray, type]: + if is_object_dtype(vals.dtype) and skipna: + # GH#37501: don't raise on pd.NA when skipna=True + mask = isna(vals) + if mask.any(): + # mask on original values computed separately + vals = vals.copy() + vals[mask] = True + elif isinstance(vals, BaseMaskedArray): + vals = vals._data + vals = vals.astype(bool, copy=False) + return vals.view(np.int8), bool + + def post_processing( + result: np.ndarray, + inference: type, + nullable: bool = False, + ) -> ArrayLike: + if nullable: + return BooleanArray(result.astype(bool, copy=False), result == -1) + else: + return result.astype(inference, copy=False) + + values = values.T + ncols = 1 if values.ndim == 1 else values.shape[1] + + result: ArrayLike + result = np.zeros(ngroups * ncols, dtype=cython_dtype) + result = result.reshape((ngroups, ncols)) + + func = partial(base_func, out=result, labels=ids) + + inferences = None + + if needs_counts: + counts = np.zeros(ngroups, dtype=np.int64) + func = partial(func, counts=counts) + + vals = values + if pre_processing: + vals, inferences = pre_processing(vals) + + vals = vals.astype(cython_dtype, copy=False) + if vals.ndim == 1: + vals = vals.reshape((-1, 1)) + func = partial(func, values=vals) + + if how != "std" or isinstance(values, BaseMaskedArray): + mask = isna(values).view(np.uint8) + if mask.ndim == 1: + mask = mask.reshape(-1, 1) + func = partial(func, mask=mask) + + if how != "std": + is_nullable = isinstance(values, BaseMaskedArray) + func = partial(func, nullable=is_nullable) + + elif isinstance(values, BaseMaskedArray): + result_mask = np.zeros(result.shape, dtype=np.bool_) + func = partial(func, result_mask=result_mask) + + func(val_test=val_test, skipna=skipna) # Call func to modify result in place + + if values.ndim == 1: + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + if post_processing: + pp_kwargs: dict[str, bool | np.ndarray] = {} + pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) + if how == "std" and pp_kwargs["nullable"]: + pp_kwargs["result_mask"] = result_mask + + result = post_processing(result, inferences, **pp_kwargs) + + return result.T + + def groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): + values = self + how = "std" + base_func = libgroupby.group_var + cython_dtype = np.dtype(np.float64) + needs_counts = True + + from pandas.core.arrays import ( + BaseMaskedArray, + FloatingArray, + ) + + def pre_processing(values): + if isinstance(values, BaseMaskedArray): + return values._data, None + return values, None + + def post_processing( + vals, inference, nullable: bool = False, result_mask=None + ) -> ArrayLike: + if nullable: + if result_mask.ndim == 2: + result_mask = result_mask[:, 0] + return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_)) + return np.sqrt(vals) + + values = values.T + ncols = 1 if values.ndim == 1 else values.shape[1] + + result: ArrayLike + result = np.zeros(ngroups * ncols, dtype=cython_dtype) + result = result.reshape((ngroups, ncols)) + + func = partial(base_func, out=result, labels=ids) + + inferences = None + + if needs_counts: + counts = np.zeros(ngroups, dtype=np.int64) + func = partial(func, counts=counts) + + vals = values + if pre_processing: + vals, inferences = pre_processing(vals) + + vals = vals.astype(cython_dtype, copy=False) + if vals.ndim == 1: + vals = vals.reshape((-1, 1)) + func = partial(func, values=vals) + + if how != "std" or isinstance(values, BaseMaskedArray): + mask = isna(values).view(np.uint8) + if mask.ndim == 1: + mask = mask.reshape(-1, 1) + func = partial(func, mask=mask) + + if how != "std": + is_nullable = isinstance(values, BaseMaskedArray) + func = partial(func, nullable=is_nullable) + + elif isinstance(values, BaseMaskedArray): + result_mask = np.zeros(result.shape, dtype=np.bool_) + func = partial(func, result_mask=result_mask) + + func(ddof=ddof) # Call func to modify result in place + + if values.ndim == 1: + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + if post_processing: + pp_kwargs: dict[str, bool | np.ndarray] = {} + pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) + if how == "std" and pp_kwargs["nullable"]: + pp_kwargs["result_mask"] = result_mask + + result = post_processing(result, inferences, **pp_kwargs) + + return result.T + class ExtensionArraySupportsAnyAll(ExtensionArray): def any(self, *, skipna: bool = True) -> bool: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 696f924e31179..c8cd5835f11d0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3719,9 +3719,21 @@ def _get_cythonized_result( ids, _, ngroups = grouper.group_info - base_func = partial(base_func, labels=ids) - def blk_func(values: ArrayLike) -> ArrayLike: + + if isinstance(values, ExtensionArray): + if how == "std": + return values.groupby_std( + ngroups=ngroups, ids=ids, ddof=kwargs["ddof"] + ) + elif how == "any_all": + return values.groupby_any_all( + ngroups=ngroups, + ids=ids, + skipna=kwargs["skipna"], + val_test=kwargs["val_test"], + ) + values = values.T ncols = 1 if values.ndim == 1 else values.shape[1] @@ -3729,7 +3741,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: result = np.zeros(ngroups * ncols, dtype=cython_dtype) result = result.reshape((ngroups, ncols)) - func = partial(base_func, out=result) + func = partial(base_func, out=result, labels=ids) inferences = None From cfa3bfd612095be67841e514fa40310997db2945 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 14:18:17 -0800 Subject: [PATCH 2/6] Implement MaskedArray methods --- pandas/_libs/groupby.pyi | 1 + pandas/core/arrays/base.py | 195 +++++++++------------------------ pandas/core/arrays/masked.py | 62 +++++++++++ pandas/core/groupby/groupby.py | 43 +------- 4 files changed, 118 insertions(+), 183 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 0f72b2b72141f..cd87c406d7f32 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -55,6 +55,7 @@ def group_any_all( mask: np.ndarray, # const uint8_t[::1] val_test: Literal["any", "all"], skipna: bool, + nullable: bool = ..., ) -> None: ... def group_sum( out: np.ndarray, # complexfloatingintuint_t[:, ::1] diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0363a749572d7..d2b154c298b6f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,7 +8,6 @@ """ from __future__ import annotations -from functools import partial import operator from typing import ( TYPE_CHECKING, @@ -1697,174 +1696,78 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # GroupBy Methods def groupby_any_all( - self, *, ngroups: int, ids: npt.NDArray[np.intp], val_test: str, skipna: bool + self, + *, + ngroups: int, + ids: npt.NDArray[np.intp], + val_test: Literal["any", "all"], + skipna: bool, ): - values = self - how = "any_all" - base_func = libgroupby.group_any_all - cython_dtype = np.dtype(np.int8) - needs_counts = False - - from pandas.core.arrays import ( - BaseMaskedArray, - BooleanArray, - ) - - def pre_processing(vals: ArrayLike) -> tuple[np.ndarray, type]: - if is_object_dtype(vals.dtype) and skipna: - # GH#37501: don't raise on pd.NA when skipna=True - mask = isna(vals) - if mask.any(): - # mask on original values computed separately - vals = vals.copy() - vals[mask] = True - elif isinstance(vals, BaseMaskedArray): - vals = vals._data - vals = vals.astype(bool, copy=False) - return vals.view(np.int8), bool - - def post_processing( - result: np.ndarray, - inference: type, - nullable: bool = False, - ) -> ArrayLike: - if nullable: - return BooleanArray(result.astype(bool, copy=False), result == -1) - else: - return result.astype(inference, copy=False) - - values = values.T + values = self.T ncols = 1 if values.ndim == 1 else values.shape[1] - result: ArrayLike - result = np.zeros(ngroups * ncols, dtype=cython_dtype) - result = result.reshape((ngroups, ncols)) - - func = partial(base_func, out=result, labels=ids) - - inferences = None + result = np.zeros((ngroups, ncols), dtype=np.int8) - if needs_counts: - counts = np.zeros(ngroups, dtype=np.int64) - func = partial(func, counts=counts) + obj = values + if is_object_dtype(self.dtype) and skipna: + # GH#37501: don't raise on pd.NA when skipna=True + mask = self.isna() + if mask.any(): + # mask on original values computed separately + obj = obj.copy() + obj[mask] = True - vals = values - if pre_processing: - vals, inferences = pre_processing(vals) - - vals = vals.astype(cython_dtype, copy=False) + vals = obj.astype(bool, copy=False) + vals = vals.view(np.int8) if vals.ndim == 1: - vals = vals.reshape((-1, 1)) - func = partial(func, values=vals) - - if how != "std" or isinstance(values, BaseMaskedArray): - mask = isna(values).view(np.uint8) - if mask.ndim == 1: - mask = mask.reshape(-1, 1) - func = partial(func, mask=mask) - - if how != "std": - is_nullable = isinstance(values, BaseMaskedArray) - func = partial(func, nullable=is_nullable) - - elif isinstance(values, BaseMaskedArray): - result_mask = np.zeros(result.shape, dtype=np.bool_) - func = partial(func, result_mask=result_mask) - - func(val_test=val_test, skipna=skipna) # Call func to modify result in place + vals = vals.reshape(-1, 1) + + mask = isna(values).view(np.uint8) + if mask.ndim == 1: + mask = mask.reshape(-1, 1) + + # Call func to modify result in place + libgroupby.group_any_all( + out=result, + labels=ids, + values=vals, + val_test=val_test, + skipna=skipna, + mask=mask, + nullable=False, + ) if values.ndim == 1: assert result.shape[1] == 1, result.shape result = result[:, 0] - if post_processing: - pp_kwargs: dict[str, bool | np.ndarray] = {} - pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) - if how == "std" and pp_kwargs["nullable"]: - pp_kwargs["result_mask"] = result_mask - - result = post_processing(result, inferences, **pp_kwargs) - + result.astype(bool, copy=False) return result.T def groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): - values = self - how = "std" - base_func = libgroupby.group_var cython_dtype = np.dtype(np.float64) - needs_counts = True - from pandas.core.arrays import ( - BaseMaskedArray, - FloatingArray, - ) - - def pre_processing(values): - if isinstance(values, BaseMaskedArray): - return values._data, None - return values, None - - def post_processing( - vals, inference, nullable: bool = False, result_mask=None - ) -> ArrayLike: - if nullable: - if result_mask.ndim == 2: - result_mask = result_mask[:, 0] - return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_)) - return np.sqrt(vals) - - values = values.T - ncols = 1 if values.ndim == 1 else values.shape[1] - - result: ArrayLike - result = np.zeros(ngroups * ncols, dtype=cython_dtype) - result = result.reshape((ngroups, ncols)) + ncols = 1 if self.ndim == 1 else self.shape[0] - func = partial(base_func, out=result, labels=ids) + result = np.zeros((ngroups, ncols), dtype=cython_dtype) + counts = np.zeros(ngroups, dtype=np.int64) - inferences = None - - if needs_counts: - counts = np.zeros(ngroups, dtype=np.int64) - func = partial(func, counts=counts) - - vals = values - if pre_processing: - vals, inferences = pre_processing(vals) - - vals = vals.astype(cython_dtype, copy=False) - if vals.ndim == 1: - vals = vals.reshape((-1, 1)) - func = partial(func, values=vals) + vals = self.astype(cython_dtype, copy=False) - if how != "std" or isinstance(values, BaseMaskedArray): - mask = isna(values).view(np.uint8) - if mask.ndim == 1: - mask = mask.reshape(-1, 1) - func = partial(func, mask=mask) - - if how != "std": - is_nullable = isinstance(values, BaseMaskedArray) - func = partial(func, nullable=is_nullable) - - elif isinstance(values, BaseMaskedArray): - result_mask = np.zeros(result.shape, dtype=np.bool_) - func = partial(func, result_mask=result_mask) - - func(ddof=ddof) # Call func to modify result in place + # Call func to modify result in place + libgroupby.group_var( + out=result, + labels=ids, + values=np.atleast_2d(vals).T, + counts=counts, + ddof=ddof, + ) - if values.ndim == 1: + if self.ndim == 1: assert result.shape[1] == 1, result.shape result = result[:, 0] - if post_processing: - pp_kwargs: dict[str, bool | np.ndarray] = {} - pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) - if how == "std" and pp_kwargs["nullable"]: - pp_kwargs["result_mask"] = result_mask - - result = post_processing(result, inferences, **pp_kwargs) - + result = np.sqrt(result) return result.T diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8324d4b2618f1..205c8ef0ee581 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -14,6 +14,7 @@ import numpy as np from pandas._libs import ( + groupby as libgroupby, lib, missing as libmissing, ) @@ -1383,3 +1384,64 @@ def _accumulate( data, mask = op(data, mask, skipna=skipna, **kwargs) return type(self)(data, mask, copy=False) + + # ------------------------------------------------------------------------ + # GroupBy Methods + + def groupby_any_all( + self, + *, + ngroups: int, + ids: npt.NDArray[np.intp], + val_test: Literal["any", "all"], + skipna: bool, + ): + from pandas.core.arrays import BooleanArray + + result = np.zeros(ngroups * 1, dtype=np.int8).reshape(-1, 1) + vals = self._data.astype(bool, copy=False).view(np.int8).reshape(-1, 1) + mask = self.isna().view(np.uint8).reshape(-1, 1) + + # Call func to modify result in place + libgroupby.group_any_all( + out=result, + labels=ids, + values=vals, + mask=mask, + val_test=val_test, + skipna=skipna, + nullable=True, + ) + + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + return BooleanArray(result.astype(bool, copy=False), result == -1) + + def groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): + from pandas.core.arrays import FloatingArray + + result = np.zeros(ngroups * 1, dtype=np.float64).reshape(-1, 1) + counts = np.zeros(ngroups, dtype=np.int64) + vals = self._data.astype(np.float64, copy=False).reshape(-1, 1) + mask = self.isna().view(np.uint8).reshape(-1, 1) + result_mask = np.zeros(result.shape, dtype=np.bool_) + + # Call func to modify result in place + libgroupby.group_var( + out=result, + labels=ids, + values=vals, + mask=mask, + counts=counts, + result_mask=result_mask, + ddof=ddof, + ) + + assert result.shape[1] == 1, result.shape + result = result[:, 0] + + assert result_mask.shape[1] == 1, result_mask.shape + result_mask = result_mask[:, 0] + + return FloatingArray(np.sqrt(result), result_mask.view(np.bool_)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c8cd5835f11d0..54ccdacc8cb5a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -97,7 +97,6 @@ class providing the base-class of operations. from pandas.core._numba import executor from pandas.core.arrays import ( BaseMaskedArray, - BooleanArray, Categorical, ExtensionArray, FloatingArray, @@ -1808,8 +1807,6 @@ def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: # mask on original values computed separately vals = vals.copy() vals[mask] = True - elif isinstance(vals, BaseMaskedArray): - vals = vals._data vals = vals.astype(bool, copy=False) return vals.view(np.int8), bool @@ -1818,10 +1815,7 @@ def result_to_bool( inference: type, nullable: bool = False, ) -> ArrayLike: - if nullable: - return BooleanArray(result.astype(bool, copy=False), result == -1) - else: - return result.astype(inference, copy=False) + return result.astype(inference, copy=False) return self._get_cythonized_result( libgroupby.group_any_all, @@ -2105,18 +2099,9 @@ def std( f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) - def _preprocessing(values): - if isinstance(values, BaseMaskedArray): - return values._data, None - return values, None - def _postprocessing( vals, inference, nullable: bool = False, result_mask=None ) -> ArrayLike: - if nullable: - if result_mask.ndim == 2: - result_mask = result_mask[:, 0] - return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_)) return np.sqrt(vals) result = self._get_cythonized_result( @@ -2124,7 +2109,7 @@ def _postprocessing( cython_dtype=np.dtype(np.float64), numeric_only=numeric_only, needs_counts=True, - pre_processing=_preprocessing, + pre_processing=None, post_processing=_postprocessing, ddof=ddof, how="std", @@ -3737,9 +3722,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: values = values.T ncols = 1 if values.ndim == 1 else values.shape[1] - result: ArrayLike - result = np.zeros(ngroups * ncols, dtype=cython_dtype) - result = result.reshape((ngroups, ncols)) + result = np.zeros((ngroups, ncols), dtype=cython_dtype) func = partial(base_func, out=result, labels=ids) @@ -3758,19 +3741,12 @@ def blk_func(values: ArrayLike) -> ArrayLike: vals = vals.reshape((-1, 1)) func = partial(func, values=vals) - if how != "std" or isinstance(values, BaseMaskedArray): + if how != "std": mask = isna(values).view(np.uint8) if mask.ndim == 1: mask = mask.reshape(-1, 1) func = partial(func, mask=mask) - - if how != "std": - is_nullable = isinstance(values, BaseMaskedArray) - func = partial(func, nullable=is_nullable) - - elif isinstance(values, BaseMaskedArray): - result_mask = np.zeros(result.shape, dtype=np.bool_) - func = partial(func, result_mask=result_mask) + func = partial(func, nullable=False) func(**kwargs) # Call func to modify result in place @@ -3778,14 +3754,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: assert result.shape[1] == 1, result.shape result = result[:, 0] - if post_processing: - pp_kwargs: dict[str, bool | np.ndarray] = {} - pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) - if how == "std" and pp_kwargs["nullable"]: - pp_kwargs["result_mask"] = result_mask - - result = post_processing(result, inferences, **pp_kwargs) - + result = post_processing(result, inferences, nullable=False) return result.T obj = self._obj_with_exclusions From 87ade9fa4e0b523582a3e1e1476c94ed9f948219 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 15:29:08 -0800 Subject: [PATCH 3/6] REF: dispatch to PandasArray --- pandas/core/arrays/base.py | 24 +++----- pandas/core/groupby/groupby.py | 105 ++------------------------------- 2 files changed, 14 insertions(+), 115 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d2b154c298b6f..3b2a73d493316 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1703,12 +1703,10 @@ def groupby_any_all( val_test: Literal["any", "all"], skipna: bool, ): - values = self.T - ncols = 1 if values.ndim == 1 else values.shape[1] - + ncols = 1 if self.ndim == 1 else self.shape[0] result = np.zeros((ngroups, ncols), dtype=np.int8) - obj = values + obj = self if is_object_dtype(self.dtype) and skipna: # GH#37501: don't raise on pd.NA when skipna=True mask = self.isna() @@ -1717,31 +1715,25 @@ def groupby_any_all( obj = obj.copy() obj[mask] = True - vals = obj.astype(bool, copy=False) - vals = vals.view(np.int8) - if vals.ndim == 1: - vals = vals.reshape(-1, 1) - - mask = isna(values).view(np.uint8) - if mask.ndim == 1: - mask = mask.reshape(-1, 1) + vals = obj.astype(bool, copy=False).view(np.int8) + mask = self.isna().view(np.uint8) # Call func to modify result in place libgroupby.group_any_all( out=result, labels=ids, - values=vals, + values=np.atleast_2d(vals).T, + mask=np.atleast_2d(mask).T, val_test=val_test, skipna=skipna, - mask=mask, nullable=False, ) - if values.ndim == 1: + if self.ndim == 1: assert result.shape[1] == 1, result.shape result = result[:, 0] - result.astype(bool, copy=False) + result = result.astype(bool, copy=False) return result.T def groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 54ccdacc8cb5a..744c26977982b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -100,6 +100,7 @@ class providing the base-class of operations. Categorical, ExtensionArray, FloatingArray, + PandasArray, ) from pandas.core.base import ( PandasObject, @@ -1798,31 +1799,8 @@ def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool): """ Shared func to call any / all Cython GroupBy implementations. """ - - def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: - if is_object_dtype(vals.dtype) and skipna: - # GH#37501: don't raise on pd.NA when skipna=True - mask = isna(vals) - if mask.any(): - # mask on original values computed separately - vals = vals.copy() - vals[mask] = True - vals = vals.astype(bool, copy=False) - return vals.view(np.int8), bool - - def result_to_bool( - result: np.ndarray, - inference: type, - nullable: bool = False, - ) -> ArrayLike: - return result.astype(inference, copy=False) - return self._get_cythonized_result( - libgroupby.group_any_all, numeric_only=False, - cython_dtype=np.dtype(np.int8), - pre_processing=objs_to_bool, - post_processing=result_to_bool, val_test=val_test, skipna=skipna, ) @@ -2099,18 +2077,8 @@ def std( f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) - def _postprocessing( - vals, inference, nullable: bool = False, result_mask=None - ) -> ArrayLike: - return np.sqrt(vals) - result = self._get_cythonized_result( - libgroupby.group_var, - cython_dtype=np.dtype(np.float64), numeric_only=numeric_only, - needs_counts=True, - pre_processing=None, - post_processing=_postprocessing, ddof=ddof, how="std", ) @@ -3651,12 +3619,7 @@ def cummax( @final def _get_cythonized_result( self, - base_func: Callable, - cython_dtype: np.dtype, numeric_only: bool = False, - needs_counts: bool = False, - pre_processing=None, - post_processing=None, how: str = "any_all", **kwargs, ): @@ -3665,27 +3628,8 @@ def _get_cythonized_result( Parameters ---------- - base_func : callable, Cythonized function to be called - cython_dtype : np.dtype - Type of the array that will be modified by the Cython call. numeric_only : bool, default False Whether only numeric datatypes should be computed - needs_counts : bool, default False - Whether the counts should be a part of the Cython call - pre_processing : function, default None - Function to be applied to `values` prior to passing to Cython. - Function should return a tuple where the first element is the - values to be passed to Cython and the second element is an optional - type which the values should be converted to after being returned - by the Cython operation. This function is also responsible for - raising a TypeError if the values have an invalid type. Raises - if `needs_values` is False. - post_processing : function, default None - Function to be applied to result of Cython function. Should accept - an array of values as the first argument and type inferences as its - second argument, i.e. the signature should be - (ndarray, Type). If `needs_nullable=True`, a third argument should be - `nullable`, to allow for processing specific to nullable values. how : str, default any_all Determines if any/all cython interface or std interface is used. **kwargs : dict @@ -3695,11 +3639,6 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ - if post_processing and not callable(post_processing): - raise ValueError("'post_processing' must be a callable!") - if pre_processing and not callable(pre_processing): - raise ValueError("'pre_processing' must be a callable!") - grouper = self.grouper ids, _, ngroups = grouper.group_info @@ -3718,44 +3657,12 @@ def blk_func(values: ArrayLike) -> ArrayLike: skipna=kwargs["skipna"], val_test=kwargs["val_test"], ) + else: + raise NotImplementedError - values = values.T - ncols = 1 if values.ndim == 1 else values.shape[1] - - result = np.zeros((ngroups, ncols), dtype=cython_dtype) - - func = partial(base_func, out=result, labels=ids) - - inferences = None - - if needs_counts: - counts = np.zeros(ngroups, dtype=np.int64) - func = partial(func, counts=counts) - - vals = values - if pre_processing: - vals, inferences = pre_processing(vals) - - vals = vals.astype(cython_dtype, copy=False) - if vals.ndim == 1: - vals = vals.reshape((-1, 1)) - func = partial(func, values=vals) - - if how != "std": - mask = isna(values).view(np.uint8) - if mask.ndim == 1: - mask = mask.reshape(-1, 1) - func = partial(func, mask=mask) - func = partial(func, nullable=False) - - func(**kwargs) # Call func to modify result in place - - if values.ndim == 1: - assert result.shape[1] == 1, result.shape - result = result[:, 0] - - result = post_processing(result, inferences, nullable=False) - return result.T + arr = PandasArray(values) + result = blk_func(arr) + return result obj = self._obj_with_exclusions From ee0e8138d8a5748145d3dc7b696ae0cd57e36174 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 18:02:35 -0800 Subject: [PATCH 4/6] mypy fixup --- pandas/core/arrays/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3b2a73d493316..0b8f48788b6e5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1709,14 +1709,14 @@ def groupby_any_all( obj = self if is_object_dtype(self.dtype) and skipna: # GH#37501: don't raise on pd.NA when skipna=True - mask = self.isna() - if mask.any(): + self_mask = self.isna() + if self_mask.any(): # mask on original values computed separately obj = obj.copy() - obj[mask] = True + obj[self_mask] = True vals = obj.astype(bool, copy=False).view(np.int8) - mask = self.isna().view(np.uint8) + mask = np.asarray(self.isna()).view(np.uint8) # Call func to modify result in place libgroupby.group_any_all( From 25f6802d2173e91935f827bff08f0a82bdda207f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Feb 2023 14:21:34 -0800 Subject: [PATCH 5/6] remove leftover *1 --- pandas/core/arrays/masked.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d4a2147f05ce3..3d1c5102b2030 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1405,7 +1405,7 @@ def groupby_any_all( ): from pandas.core.arrays import BooleanArray - result = np.zeros(ngroups * 1, dtype=np.int8).reshape(-1, 1) + result = np.zeros(ngroups, dtype=np.int8).reshape(-1, 1) vals = self._data.astype(bool, copy=False).view(np.int8).reshape(-1, 1) mask = self.isna().view(np.uint8).reshape(-1, 1) @@ -1428,7 +1428,7 @@ def groupby_any_all( def groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int): from pandas.core.arrays import FloatingArray - result = np.zeros(ngroups * 1, dtype=np.float64).reshape(-1, 1) + result = np.zeros(ngroups, dtype=np.float64).reshape(-1, 1) counts = np.zeros(ngroups, dtype=np.int64) vals = self._data.astype(np.float64, copy=False).reshape(-1, 1) mask = self.isna().view(np.uint8).reshape(-1, 1) From 9844dea6d22ea292c3b05131fa36e97de22c7a7e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Mar 2023 11:16:50 -0800 Subject: [PATCH 6/6] restore warnings import --- pandas/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4b80856e5c59c..d853bf4a21d0e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -29,6 +29,7 @@ class providing the base-class of operations. cast, final, ) +import warnings import numpy as np @@ -1902,7 +1903,6 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - result = self._get_cythonized_result( numeric_only=numeric_only, ddof=ddof, @@ -3601,7 +3601,6 @@ def _get_cythonized_result( ids, _, ngroups = grouper.group_info def blk_func(values: ArrayLike) -> ArrayLike: - if isinstance(values, ExtensionArray): if how == "std": return values._groupby_std(