diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a97e41cbc508a..762e135111cf8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -72,9 +72,6 @@ PeriodArray, TimedeltaArray, ) -from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.floating import FloatingDtype -from pandas.core.arrays.integer import IntegerDtype from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, @@ -147,26 +144,6 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: }, } - # "group_any" and "group_all" are also support masks, but don't go - # through WrappedCythonOp - _MASKED_CYTHON_FUNCTIONS = { - "cummin", - "cummax", - "min", - "max", - "last", - "first", - "rank", - "sum", - "ohlc", - "cumprod", - "cumsum", - "prod", - "mean", - "var", - "median", - } - _cython_arity = {"ohlc": 4} # OHLC # Note: we make this a classmethod and pass kind+how so that caching @@ -220,8 +197,8 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["median"]: - # these two only have float64 implementations + if how == "median": + # median only has a float64 implementation # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function values = ensure_float64(values) @@ -293,7 +270,7 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: out_shape: Shape if how == "ohlc": - out_shape = (ngroups, 4) + out_shape = (ngroups, arity) elif arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" @@ -342,9 +319,6 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: return np.dtype(np.float64) return dtype - def uses_mask(self) -> bool: - return self.how in self._MASKED_CYTHON_FUNCTIONS - @final def _ea_wrap_cython_operation( self, @@ -358,7 +332,7 @@ def _ea_wrap_cython_operation( If we have an ExtensionArray, unwrap, call _cython_operation, and re-wrap if appropriate. """ - if isinstance(values, BaseMaskedArray) and self.uses_mask(): + if isinstance(values, BaseMaskedArray): return self._masked_ea_wrap_cython_operation( values, min_count=min_count, @@ -367,7 +341,7 @@ def _ea_wrap_cython_operation( **kwargs, ) - elif isinstance(values, Categorical) and self.uses_mask(): + elif isinstance(values, Categorical): assert self.how == "rank" # the only one implemented ATM assert values.ordered # checked earlier mask = values.isna() @@ -398,7 +372,7 @@ def _ea_wrap_cython_operation( ) if self.how in self.cast_blocklist: - # i.e. how in ["rank"], since other cast_blocklist methods dont go + # i.e. how in ["rank"], since other cast_blocklist methods don't go # through cython_operation return res_values @@ -411,12 +385,6 @@ def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray: # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents npvalues = values._ndarray.view("M8[ns]") - elif isinstance(values.dtype, (BooleanDtype, IntegerDtype)): - # IntegerArray or BooleanArray - npvalues = values.to_numpy("float64", na_value=np.nan) - elif isinstance(values.dtype, FloatingDtype): - # FloatingArray - npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) elif isinstance(values.dtype, StringDtype): # StringArray npvalues = values.to_numpy(object, na_value=np.nan) @@ -440,12 +408,6 @@ def _reconstruct_ea_result( string_array_cls = dtype.construct_array_type() return string_array_cls._from_sequence(res_values, dtype=dtype) - elif isinstance(values.dtype, BaseMaskedDtype): - new_dtype = self._get_result_dtype(values.dtype.numpy_dtype) - dtype = BaseMaskedDtype.from_numpy_dtype(new_dtype) - masked_array_cls = dtype.construct_array_type() - return masked_array_cls._from_sequence(res_values, dtype=dtype) - elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)): # In to_cython_values we took a view as M8[ns] assert res_values.dtype == "M8[ns]" @@ -489,7 +451,8 @@ def _masked_ea_wrap_cython_operation( ) if self.how == "ohlc": - result_mask = np.tile(result_mask, (4, 1)).T + arity = self._cython_arity.get(self.how, 1) + result_mask = np.tile(result_mask, (arity, 1)).T # res_values should already have the correct dtype, we just need to # wrap in a MaskedArray @@ -580,7 +543,7 @@ def _call_cython_op( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max", "mean", "last", "first"]: + if self.how in ["min", "max", "mean", "last", "first", "sum"]: func( out=result, counts=counts, @@ -591,18 +554,6 @@ def _call_cython_op( result_mask=result_mask, is_datetimelike=is_datetimelike, ) - elif self.how in ["sum"]: - # We support datetimelike - func( - out=result, - counts=counts, - values=values, - labels=comp_ids, - mask=mask, - result_mask=result_mask, - min_count=min_count, - is_datetimelike=is_datetimelike, - ) elif self.how in ["var", "ohlc", "prod", "median"]: func( result, @@ -615,31 +566,21 @@ def _call_cython_op( **kwargs, ) else: - func(result, counts, values, comp_ids, min_count) + raise NotImplementedError(f"{self.how} is not implemented") else: # TODO: min_count - if self.uses_mask(): - if self.how != "rank": - # TODO: should rank take result_mask? - kwargs["result_mask"] = result_mask - func( - out=result, - values=values, - labels=comp_ids, - ngroups=ngroups, - is_datetimelike=is_datetimelike, - mask=mask, - **kwargs, - ) - else: - func( - out=result, - values=values, - labels=comp_ids, - ngroups=ngroups, - is_datetimelike=is_datetimelike, - **kwargs, - ) + if self.how != "rank": + # TODO: should rank take result_mask? + kwargs["result_mask"] = result_mask + func( + out=result, + values=values, + labels=comp_ids, + ngroups=ngroups, + is_datetimelike=is_datetimelike, + mask=mask, + **kwargs, + ) if self.kind == "aggregate": # i.e. counts is defined. Locations where count