From 9528ffc7d0bacf449de2a0c18dc82a9a840f9695 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Feb 2023 17:51:22 -0800 Subject: [PATCH 1/7] REF: move groupby reduction methods to EA --- pandas/core/arrays/base.py | 53 +++++++++ pandas/core/arrays/categorical.py | 24 ++++ pandas/core/arrays/masked.py | 31 +++++ pandas/core/groupby/ops.py | 188 +++--------------------------- 4 files changed, 125 insertions(+), 171 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c261a41e1e77e..29d3127fee492 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1688,6 +1688,59 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) + # ------------------------------------------------------------------------ + # GroupBy Methods + + def groupby_op( + self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + ): + from pandas.core.arrays import ( + DatetimeArray, + PeriodArray, + TimedeltaArray, + ) + from pandas.core.arrays.string_ import StringDtype + + # GH#43682 + if isinstance(self, (DatetimeArray, PeriodArray, TimedeltaArray)): + # All of the functions implemented here are ordinal, so we can + # operate on the tz-naive equivalents + npvalues = self._ndarray.view("M8[ns]") + elif isinstance(self.dtype, StringDtype): + # StringArray + npvalues = self.to_numpy(object, na_value=np.nan) + else: + raise NotImplementedError( + f"function is not implemented for this dtype: {self.dtype}" + ) + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=None, + **kwargs, + ) + + if op.how in op.cast_blocklist: + # i.e. how in ["rank"], since other cast_blocklist methods don't go + # through cython_operation + return res_values + + if isinstance(self.dtype, StringDtype): + dtype = self.dtype + string_array_cls = dtype.construct_array_type() + return string_array_cls._from_sequence(res_values, dtype=dtype) + + elif isinstance(self, (DatetimeArray, TimedeltaArray, PeriodArray)): + # In to_cython_values we took a view as M8[ns] + assert res_values.dtype == "M8[ns]" + res_values = res_values.view(self._ndarray.dtype) + return self._from_backing_data(res_values) + else: + raise NotImplementedError + class ExtensionArraySupportsAnyAll(ExtensionArray): def any(self, *, skipna: bool = True) -> bool: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f816b30b825b7..ceae5c385967b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2318,6 +2318,30 @@ def _str_get_dummies(self, sep: str = "|"): return PandasArray(self.astype(str))._str_get_dummies(sep) + # ------------------------------------------------------------------------ + # GroupBy Methods + + def groupby_op( + self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + ): + assert op.how == "rank" # the only one implemented ATM + assert self.ordered # checked earlier + mask = self.isna() + npvalues = self._ndarray + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=mask, + **kwargs, + ) + + # If we ever have more than just "rank" here, we'll need to do + # `if op.how in op.cast_blocklist` like we do for other dtypes. + return res_values + # The Series.cat accessor diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8324d4b2618f1..bf12a9ac01a5a 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1383,3 +1383,34 @@ def _accumulate( data, mask = op(data, mask, skipna=skipna, **kwargs) return type(self)(data, mask, copy=False) + + # ------------------------------------------------------------------ + # GroupBy Methods + + def groupby_op( + self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + ): + # libgroupby functions are responsible for NOT altering mask + mask = self._mask + if op.kind != "aggregate": + result_mask = mask.copy() + else: + result_mask = np.zeros(ngroups, dtype=bool) + + res_values = op._cython_op_ndim_compat( + self._data, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + + if op.how == "ohlc": + arity = op._cython_arity.get(op.how, 1) + result_mask = np.tile(result_mask, (arity, 1)).T + + # res_values should already have the correct dtype, we just need to + # wrap in a MaskedArray + return self._maybe_mask_result(res_values, result_mask) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bff61ec135d74..6d959bd7a3a79 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -65,18 +65,6 @@ maybe_fill, ) -from pandas.core.arrays import ( - Categorical, - DatetimeArray, - ExtensionArray, - PeriodArray, - TimedeltaArray, -) -from pandas.core.arrays.masked import ( - BaseMaskedArray, - BaseMaskedDtype, -) -from pandas.core.arrays.string_ import StringDtype from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -220,7 +208,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: return values # TODO: general case implementation overridable by EAs. - def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): + def _disallow_invalid_ops(self, dtype: DtypeObj): """ Check if we can do this operation with our cython functions. @@ -233,7 +221,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): """ how = self.how - if is_numeric: + if is_numeric_dtype(dtype): # never an invalid op for those dtypes, so return early as fastpath return @@ -321,145 +309,6 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: return np.dtype(np.float64) return dtype - @final - def _ea_wrap_cython_operation( - self, - values: ExtensionArray, - min_count: int, - ngroups: int, - comp_ids: np.ndarray, - **kwargs, - ) -> ArrayLike: - """ - If we have an ExtensionArray, unwrap, call _cython_operation, and - re-wrap if appropriate. - """ - if isinstance(values, BaseMaskedArray): - return self._masked_ea_wrap_cython_operation( - values, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - **kwargs, - ) - - elif isinstance(values, Categorical): - assert self.how == "rank" # the only one implemented ATM - assert values.ordered # checked earlier - mask = values.isna() - npvalues = values._ndarray - - res_values = self._cython_op_ndim_compat( - npvalues, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=mask, - **kwargs, - ) - - # If we ever have more than just "rank" here, we'll need to do - # `if self.how in self.cast_blocklist` like we do for other dtypes. - return res_values - - npvalues = self._ea_to_cython_values(values) - - res_values = self._cython_op_ndim_compat( - npvalues, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=None, - **kwargs, - ) - - if self.how in self.cast_blocklist: - # i.e. how in ["rank"], since other cast_blocklist methods don't go - # through cython_operation - return res_values - - return self._reconstruct_ea_result(values, res_values) - - # TODO: general case implementation overridable by EAs. - def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray: - # GH#43682 - if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)): - # All of the functions implemented here are ordinal, so we can - # operate on the tz-naive equivalents - npvalues = values._ndarray.view("M8[ns]") - elif isinstance(values.dtype, StringDtype): - # StringArray - npvalues = values.to_numpy(object, na_value=np.nan) - else: - raise NotImplementedError( - f"function is not implemented for this dtype: {values.dtype}" - ) - return npvalues - - # TODO: general case implementation overridable by EAs. - def _reconstruct_ea_result( - self, values: ExtensionArray, res_values: np.ndarray - ) -> ExtensionArray: - """ - Construct an ExtensionArray result from an ndarray result. - """ - dtype: BaseMaskedDtype | StringDtype - - if isinstance(values.dtype, StringDtype): - dtype = values.dtype - string_array_cls = dtype.construct_array_type() - return string_array_cls._from_sequence(res_values, dtype=dtype) - - elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)): - # In to_cython_values we took a view as M8[ns] - assert res_values.dtype == "M8[ns]" - res_values = res_values.view(values._ndarray.dtype) - return values._from_backing_data(res_values) - - raise NotImplementedError - - @final - def _masked_ea_wrap_cython_operation( - self, - values: BaseMaskedArray, - min_count: int, - ngroups: int, - comp_ids: np.ndarray, - **kwargs, - ) -> BaseMaskedArray: - """ - Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's - and cython algorithms which accept a mask. - """ - orig_values = values - - # libgroupby functions are responsible for NOT altering mask - mask = values._mask - if self.kind != "aggregate": - result_mask = mask.copy() - else: - result_mask = np.zeros(ngroups, dtype=bool) - - arr = values._data - - res_values = self._cython_op_ndim_compat( - arr, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=mask, - result_mask=result_mask, - **kwargs, - ) - - if self.how == "ohlc": - arity = self._cython_arity.get(self.how, 1) - result_mask = np.tile(result_mask, (arity, 1)).T - - # res_values should already have the correct dtype, we just need to - # wrap in a MaskedArray - return orig_values._maybe_mask_result(res_values, result_mask) - @final def _cython_op_ndim_compat( self, @@ -614,6 +463,17 @@ def _call_cython_op( return op_result + @final + def _validate_axis(self, axis: AxisInt, values: ArrayLike) -> None: + if values.ndim > 2: + raise NotImplementedError("number of dimensions is currently limited to 2") + if values.ndim == 2: + assert axis == 1, axis + elif not is_1d_only_ea_dtype(values.dtype): + # Note: it is *not* the case that axis is always 0 for 1-dim values, + # as we can have 1D ExtensionArrays that we need to treat as 2D + assert axis == 0 + @final def cython_operation( self, @@ -628,30 +488,16 @@ def cython_operation( """ Call our cython function, with appropriate pre- and post- processing. """ - if values.ndim > 2: - raise NotImplementedError("number of dimensions is currently limited to 2") - if values.ndim == 2: - assert axis == 1, axis - elif not is_1d_only_ea_dtype(values.dtype): - # Note: it is *not* the case that axis is always 0 for 1-dim values, - # as we can have 1D ExtensionArrays that we need to treat as 2D - assert axis == 0 - - dtype = values.dtype - is_numeric = is_numeric_dtype(dtype) + self._validate_axis(axis, values) # can we do this operation with our cython functions # if not raise NotImplementedError - self._disallow_invalid_ops(dtype, is_numeric) + self._disallow_invalid_ops(values.dtype) if not isinstance(values, np.ndarray): # i.e. ExtensionArray - return self._ea_wrap_cython_operation( - values, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - **kwargs, + return values.groupby_op( + self, min_count=min_count, ngroups=ngroups, ids=comp_ids, **kwargs ) return self._cython_op_ndim_compat( From 1ea4a723c5e0309f28dc24d049b9e92db5bcb0c2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Feb 2023 22:09:53 -0800 Subject: [PATCH 2/7] REF: move EA-specific checks to EAs --- pandas/core/arrays/base.py | 16 +-------- pandas/core/arrays/categorical.py | 14 ++++++++ pandas/core/arrays/datetimelike.py | 44 ++++++++++++++++++++++++ pandas/core/arrays/sparse/array.py | 8 +++++ pandas/core/groupby/ops.py | 55 ------------------------------ 5 files changed, 67 insertions(+), 70 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 29d3127fee492..83c818039fda8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1694,19 +1694,10 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): def groupby_op( self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs ): - from pandas.core.arrays import ( - DatetimeArray, - PeriodArray, - TimedeltaArray, - ) from pandas.core.arrays.string_ import StringDtype # GH#43682 - if isinstance(self, (DatetimeArray, PeriodArray, TimedeltaArray)): - # All of the functions implemented here are ordinal, so we can - # operate on the tz-naive equivalents - npvalues = self._ndarray.view("M8[ns]") - elif isinstance(self.dtype, StringDtype): + if isinstance(self.dtype, StringDtype): # StringArray npvalues = self.to_numpy(object, na_value=np.nan) else: @@ -1733,11 +1724,6 @@ def groupby_op( string_array_cls = dtype.construct_array_type() return string_array_cls._from_sequence(res_values, dtype=dtype) - elif isinstance(self, (DatetimeArray, TimedeltaArray, PeriodArray)): - # In to_cython_values we took a view as M8[ns] - assert res_values.dtype == "M8[ns]" - res_values = res_values.view(self._ndarray.dtype) - return self._from_backing_data(res_values) else: raise NotImplementedError diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ceae5c385967b..e12562d334e01 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2324,6 +2324,20 @@ def _str_get_dummies(self, sep: str = "|"): def groupby_op( self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs ): + + how = op.how + dtype = self.dtype + if how in ["sum", "prod", "cumsum", "cumprod"]: + raise TypeError(f"{dtype} type does not support {how} operations") + if how in ["min", "max", "rank"] and not dtype.ordered: + # raise TypeError instead of NotImplementedError to ensure we + # don't go down a group-by-group path, since in the empty-groups + # case that would fail to raise + raise TypeError(f"Cannot perform {how} with non-ordered Categorical") + if how not in ["rank"]: + # only "rank" is implemented in cython + raise NotImplementedError(f"{dtype} dtype not supported") + assert op.how == "rank" # the only one implemented ATM assert self.ordered # checked earlier mask = self.isna() diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4b26528e6661c..03c525316ed66 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1573,6 +1573,50 @@ def _mode(self, dropna: bool = True): npmodes = cast(np.ndarray, npmodes) return self._from_backing_data(npmodes) + # ------------------------------------------------------------------ + # GroupBy Methods + + def groupby_op( + self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + ): + dtype = self.dtype + how = op.how + if dtype.kind == "M": + # Adding/multiplying datetimes is not valid + if how in ["sum", "prod", "cumsum", "cumprod"]: + raise TypeError(f"datetime64 type does not support {how} operations") + elif is_period_dtype(dtype): + # Adding/multiplying Periods is not valid + if how in ["sum", "prod", "cumsum", "cumprod"]: + raise TypeError(f"Period type does not support {how} operations") + else: + # timedeltas we can add but not multiply + if how in ["prod", "cumprod"]: + raise TypeError(f"timedelta64 type does not support {how} operations") + + # All of the functions implemented here are ordinal, so we can + # operate on the tz-naive equivalents + npvalues = self._ndarray.view("M8[ns]") + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=None, + **kwargs, + ) + + if op.how in op.cast_blocklist: + # i.e. how in ["rank"], since other cast_blocklist methods don't go + # through cython_operation + return res_values + + # We did a view to M8[ns] above, now we go the other direction + assert res_values.dtype == "M8[ns]" + res_values = res_values.view(self._ndarray.dtype) + return self._from_backing_data(res_values) + class DatelikeOps(DatetimeLikeArrayMixin): """ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c8c33d3f52102..dd192e951ee7e 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1814,6 +1814,14 @@ def _formatter(self, boxed: bool = False): # This will infer the correct formatter from the dtype of the values. return None + # ------------------------------------------------------------------------ + # GroupBy Methods + + def groupby_op( + self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + ): + raise NotImplementedError(f"{self.dtype} dtype not supported") + def _make_sparse( arr: np.ndarray, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6d959bd7a3a79..a1814e811735f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -30,7 +30,6 @@ from pandas._typing import ( ArrayLike, AxisInt, - DtypeObj, NDFrameT, Shape, npt, @@ -50,16 +49,11 @@ is_1d_only_ea_dtype, is_bool_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, is_numeric_dtype, - is_period_dtype, - is_sparse, - is_timedelta64_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, @@ -207,51 +201,6 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: return values - # TODO: general case implementation overridable by EAs. - def _disallow_invalid_ops(self, dtype: DtypeObj): - """ - Check if we can do this operation with our cython functions. - - Raises - ------ - TypeError - This is not a valid operation for this dtype. - NotImplementedError - This may be a valid operation, but does not have a cython implementation. - """ - how = self.how - - if is_numeric_dtype(dtype): - # never an invalid op for those dtypes, so return early as fastpath - return - - if isinstance(dtype, CategoricalDtype): - if how in ["sum", "prod", "cumsum", "cumprod"]: - raise TypeError(f"{dtype} type does not support {how} operations") - if how in ["min", "max", "rank"] and not dtype.ordered: - # raise TypeError instead of NotImplementedError to ensure we - # don't go down a group-by-group path, since in the empty-groups - # case that would fail to raise - raise TypeError(f"Cannot perform {how} with non-ordered Categorical") - if how not in ["rank"]: - # only "rank" is implemented in cython - raise NotImplementedError(f"{dtype} dtype not supported") - - elif is_sparse(dtype): - raise NotImplementedError(f"{dtype} dtype not supported") - elif is_datetime64_any_dtype(dtype): - # Adding/multiplying datetimes is not valid - if how in ["sum", "prod", "cumsum", "cumprod"]: - raise TypeError(f"datetime64 type does not support {how} operations") - elif is_period_dtype(dtype): - # Adding/multiplying Periods is not valid - if how in ["sum", "prod", "cumsum", "cumprod"]: - raise TypeError(f"Period type does not support {how} operations") - elif is_timedelta64_dtype(dtype): - # timedeltas we can add but not multiply - if how in ["prod", "cumprod"]: - raise TypeError(f"timedelta64 type does not support {how} operations") - def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: how = self.how kind = self.kind @@ -490,10 +439,6 @@ def cython_operation( """ self._validate_axis(axis, values) - # can we do this operation with our cython functions - # if not raise NotImplementedError - self._disallow_invalid_ops(values.dtype) - if not isinstance(values, np.ndarray): # i.e. ExtensionArray return values.groupby_op( From 8645cf199733dace07875da7098c3c7466b5ea58 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Feb 2023 12:45:06 -0800 Subject: [PATCH 3/7] REF: dont pass op to groupby_op --- pandas/core/arrays/base.py | 13 ++++++++++++- pandas/core/arrays/categorical.py | 16 +++++++++++++--- pandas/core/arrays/datetimelike.py | 15 +++++++++++++-- pandas/core/arrays/masked.py | 14 +++++++++++++- pandas/core/groupby/ops.py | 13 ++++++++++++- 5 files changed, 63 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 83c818039fda8..378b3b83899e6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1692,9 +1692,20 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # GroupBy Methods def groupby_op( - self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, ): from pandas.core.arrays.string_ import StringDtype + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) # GH#43682 if isinstance(self.dtype, StringDtype): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e12562d334e01..4f0277f10ca43 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2322,10 +2322,20 @@ def _str_get_dummies(self, sep: str = "|"): # GroupBy Methods def groupby_op( - self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, ): + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) - how = op.how dtype = self.dtype if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"{dtype} type does not support {how} operations") @@ -2338,7 +2348,7 @@ def groupby_op( # only "rank" is implemented in cython raise NotImplementedError(f"{dtype} dtype not supported") - assert op.how == "rank" # the only one implemented ATM + assert how == "rank" # the only one implemented ATM assert self.ordered # checked earlier mask = self.isna() npvalues = self._ndarray diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 03c525316ed66..0557361906da5 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1577,10 +1577,16 @@ def _mode(self, dropna: bool = True): # GroupBy Methods def groupby_op( - self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, ): dtype = self.dtype - how = op.how if dtype.kind == "M": # Adding/multiplying datetimes is not valid if how in ["sum", "prod", "cumsum", "cumprod"]: @@ -1598,6 +1604,11 @@ def groupby_op( # operate on the tz-naive equivalents npvalues = self._ndarray.view("M8[ns]") + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + res_values = op._cython_op_ndim_compat( npvalues, min_count=min_count, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bf12a9ac01a5a..0a2e99abac13a 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1388,8 +1388,20 @@ def _accumulate( # GroupBy Methods def groupby_op( - self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, ): + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + # libgroupby functions are responsible for NOT altering mask mask = self._mask if op.kind != "aggregate": diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a1814e811735f..119252d9b5dcc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -128,6 +128,12 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: _cython_arity = {"ohlc": 4} # OHLC + @classmethod + def get_kind_from_how(cls, how: str) -> str: + if how in cls._CYTHON_FUNCTIONS["aggregate"]: + return "aggregate" + return "transform" + # Note: we make this a classmethod and pass kind+how so that caching # works at the class level and not the instance level @classmethod @@ -442,7 +448,12 @@ def cython_operation( if not isinstance(values, np.ndarray): # i.e. ExtensionArray return values.groupby_op( - self, min_count=min_count, ngroups=ngroups, ids=comp_ids, **kwargs + how=self.how, + has_dropped_na=self.has_dropped_na, + min_count=min_count, + ngroups=ngroups, + ids=comp_ids, + **kwargs, ) return self._cython_op_ndim_compat( From ea40255c9d6b8136d27fd755e719103d94c28317 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Feb 2023 15:20:22 -0800 Subject: [PATCH 4/7] mypy fixup --- pandas/core/arrays/sparse/array.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index dd192e951ee7e..787c2d78561f2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1818,7 +1818,14 @@ def _formatter(self, boxed: bool = False): # GroupBy Methods def groupby_op( - self, op, *, min_count: int, ngroups: int, ids: npt.NDArray[np.intp], **kwargs + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, ): raise NotImplementedError(f"{self.dtype} dtype not supported") From a2e7e64e3e8433bdb3998dfa42f683f91e9b0a6a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 6 Feb 2023 15:47:48 -0800 Subject: [PATCH 5/7] groupby_op -> _groupby_op --- pandas/core/arrays/base.py | 29 +++++++++++++++++++++++++++-- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/groupby/ops.py | 2 +- 6 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 378b3b83899e6..37ae7f2c8e2f8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1691,7 +1691,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # ------------------------------------------------------------------------ # GroupBy Methods - def groupby_op( + def _groupby_op( self, *, how: str, @@ -1700,7 +1700,32 @@ def groupby_op( ngroups: int, ids: npt.NDArray[np.intp], **kwargs, - ): + ) -> ArrayLike: + """ + Dispatch GroupBy reduction or transformation operation. + + This is an *experimental* API to allow ExtensionArray authors to implement + reductions and transformations. The API is subject to change. + + Parameters + ---------- + how : {'sum', 'prod', 'min', 'max', 'mean', 'median', + 'median', 'var', 'nth', 'last', 'ohlc', + 'cumprod', 'cumsum', 'cummin', 'cummax', 'rank'} + has_dropped_na : bool + min_count : int + ngroups : int + ids : np.ndarray[np.intp] + ids[i] gives the integer label for the group that self[i] belongs to. + **kwargs : operation-specific + 'var' -> ['ddof'] + 'cumprod', 'cumsum', 'cummin', 'cummax' -> ['skipna'] + 'rank' -> ['ties_method', 'ascending', 'na_option', 'pct'] + + Returns + ------- + np.ndarray or ExtensionArray + """ from pandas.core.arrays.string_ import StringDtype from pandas.core.groupby.ops import WrappedCythonOp diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4f0277f10ca43..a94f2708014ea 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2321,7 +2321,7 @@ def _str_get_dummies(self, sep: str = "|"): # ------------------------------------------------------------------------ # GroupBy Methods - def groupby_op( + def _groupby_op( self, *, how: str, diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0557361906da5..a4b6c0cff790f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1576,7 +1576,7 @@ def _mode(self, dropna: bool = True): # ------------------------------------------------------------------ # GroupBy Methods - def groupby_op( + def _groupby_op( self, *, how: str, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0a2e99abac13a..f03846de1e9c2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1387,7 +1387,7 @@ def _accumulate( # ------------------------------------------------------------------ # GroupBy Methods - def groupby_op( + def _groupby_op( self, *, how: str, diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 787c2d78561f2..7a3e1272d98aa 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1817,7 +1817,7 @@ def _formatter(self, boxed: bool = False): # ------------------------------------------------------------------------ # GroupBy Methods - def groupby_op( + def _groupby_op( self, *, how: str, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 119252d9b5dcc..0f7fa4806e0ec 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -447,7 +447,7 @@ def cython_operation( if not isinstance(values, np.ndarray): # i.e. ExtensionArray - return values.groupby_op( + return values._groupby_op( how=self.how, has_dropped_na=self.has_dropped_na, min_count=min_count, From d80c0b2a32c1f4c8c7d5963cb9b7a28be03b9544 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Mar 2023 15:35:41 -0700 Subject: [PATCH 6/7] mypy fixup --- pandas/core/arrays/base.py | 7 ++++--- pandas/core/arrays/categorical.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0a8b0382c243d..51e7785c178ac 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1743,8 +1743,8 @@ def _groupby_op( Parameters ---------- - how : {'sum', 'prod', 'min', 'max', 'mean', 'median', - 'median', 'var', 'nth', 'last', 'ohlc', + how : {'any', 'all', 'sum', 'prod', 'min', 'max', 'mean', 'median', + 'median', 'var', 'std', 'sem', 'nth', 'last', 'ohlc', 'cumprod', 'cumsum', 'cummin', 'cummax', 'rank'} has_dropped_na : bool min_count : int @@ -1752,7 +1752,8 @@ def _groupby_op( ids : np.ndarray[np.intp] ids[i] gives the integer label for the group that self[i] belongs to. **kwargs : operation-specific - 'var' -> ['ddof'] + 'any', 'all' -> ['skipna'] + 'var', 'std', 'sem' -> ['ddof'] 'cumprod', 'cumsum', 'cummin', 'cummax' -> ['skipna'] 'rank' -> ['ties_method', 'ascending', 'na_option', 'pct'] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 66c4fa8b8e195..c0179642461aa 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2388,7 +2388,7 @@ def _groupby_op( npvalues = self._ndarray elif how in ["first", "last", "min", "max"]: npvalues = self._ndarray - result_mask = np.zeros(ngroups, dtype=np.uint8) + result_mask = np.zeros(ngroups, dtype=bool) else: # any/all npvalues = self.astype(bool) From fab8725b37b9b152d7f2920f16e1f36f2cf4687b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 31 Mar 2023 13:28:56 -0700 Subject: [PATCH 7/7] mypy fixup --- pandas/core/dtypes/concat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index d54b37c06c6bd..1ec7e717839bf 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -79,7 +79,12 @@ def concat_compat( # e.g. DatetimeArray # NB: We are assuming here that ensure_wrapped_if_arraylike has # been called where relevant. - return obj._concat_same_type(to_concat_eas, axis=axis) + return obj._concat_same_type( + # error: Unexpected keyword argument "axis" for "_concat_same_type" + # of "ExtensionArray" + to_concat_eas, + axis=axis, # type: ignore[call-arg] + ) # filter empty arrays # 1-d dtypes always are included here