Skip to content

REF: let EAs override WrappedCythonOp groupby implementations #51166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1722,6 +1722,82 @@ def map(self, mapper, na_action=None):
"""
return map_array(self, mapper, na_action=na_action)

# ------------------------------------------------------------------------
# GroupBy Methods

def _groupby_op(
    self,
    *,
    how: str,
    has_dropped_na: bool,
    min_count: int,
    ngroups: int,
    ids: npt.NDArray[np.intp],
    **kwargs,
) -> ArrayLike:
    """
    Dispatch GroupBy reduction or transformation operation.

    This is an *experimental* API to allow ExtensionArray authors to implement
    reductions and transformations. The API is subject to change.

    Parameters
    ----------
    how : {'any', 'all', 'sum', 'prod', 'min', 'max', 'mean', 'median',
           'var', 'std', 'sem', 'nth', 'last', 'ohlc',
           'cumprod', 'cumsum', 'cummin', 'cummax', 'rank'}
        Name of the groupby operation to perform.
    has_dropped_na : bool
        Whether NA group labels have been dropped from ``ids``.
    min_count : int
        Minimum number of valid values required to produce a non-NA result.
    ngroups : int
        Number of distinct groups.
    ids : np.ndarray[np.intp]
        ids[i] gives the integer label for the group that self[i] belongs to.
    **kwargs : operation-specific
        'any', 'all' -> ['skipna']
        'var', 'std', 'sem' -> ['ddof']
        'cumprod', 'cumsum', 'cummin', 'cummax' -> ['skipna']
        'rank' -> ['ties_method', 'ascending', 'na_option', 'pct']

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    NotImplementedError
        If ``self.dtype`` is not a dtype this base implementation handles
        (currently only StringDtype); other EA subclasses must override.
    """
    # Imported locally to avoid a circular import with groupby internals.
    from pandas.core.arrays.string_ import StringDtype
    from pandas.core.groupby.ops import WrappedCythonOp

    kind = WrappedCythonOp.get_kind_from_how(how)
    op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

    # GH#43682
    if isinstance(self.dtype, StringDtype):
        # StringArray: run the cython op on an object-dtype ndarray with
        # NaN standing in for missing values.
        npvalues = self.to_numpy(object, na_value=np.nan)
    else:
        raise NotImplementedError(
            f"function is not implemented for this dtype: {self.dtype}"
        )

    res_values = op._cython_op_ndim_compat(
        npvalues,
        min_count=min_count,
        ngroups=ngroups,
        comp_ids=ids,
        mask=None,
        **kwargs,
    )

    if op.how in op.cast_blocklist:
        # i.e. how in ["rank"], since other cast_blocklist methods don't go
        # through cython_operation
        return res_values

    if isinstance(self.dtype, StringDtype):
        # Wrap the ndarray result back into the original string array type.
        dtype = self.dtype
        string_array_cls = dtype.construct_array_type()
        return string_array_cls._from_sequence(res_values, dtype=dtype)

    else:
        # Unreachable today (non-StringDtype raised above); kept to
        # future-proof the code if more dtypes are handled here.
        raise NotImplementedError
Comment on lines +1793 to +1799
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hasn't this if-then already been done above? or is it just to future-proof the code?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

future-proof is a nice way of putting it, yes. this is transplanted from its current position in WrappedCythonOp where the redundant checks are in separate methods



class ExtensionArraySupportsAnyAll(ExtensionArray):
def any(self, *, skipna: bool = True) -> bool:
Expand Down
59 changes: 59 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2386,6 +2386,65 @@ def _str_get_dummies(self, sep: str = "|"):

return PandasArray(self.astype(str))._str_get_dummies(sep)

# ------------------------------------------------------------------------
# GroupBy Methods

def _groupby_op(
    self,
    *,
    how: str,
    has_dropped_na: bool,
    min_count: int,
    ngroups: int,
    ids: npt.NDArray[np.intp],
    **kwargs,
):
    """
    Dispatch GroupBy reduction or transformation operation on a Categorical.

    See ExtensionArray._groupby_op for the parameter descriptions;
    ids[i] gives the integer label of the group that self[i] belongs to.

    Raises
    ------
    TypeError
        If ``how`` is not meaningful for categorical data, or requires an
        ordered Categorical and this one is unordered.
    """
    from pandas.core.groupby.ops import WrappedCythonOp

    kind = WrappedCythonOp.get_kind_from_how(how)
    op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

    dtype = self.dtype
    if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
        # Arithmetic reductions/scans are never defined for categoricals.
        raise TypeError(f"{dtype} type does not support {how} operations")
    if how in ["min", "max", "rank"] and not dtype.ordered:
        # raise TypeError instead of NotImplementedError to ensure we
        # don't go down a group-by-group path, since in the empty-groups
        # case that would fail to raise
        raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
    if how not in ["rank", "any", "all", "first", "last", "min", "max"]:
        # Remaining ops are unsupported; phrase the error by operation kind.
        if kind == "transform":
            raise TypeError(f"{dtype} type does not support {how} operations")
        raise TypeError(f"{dtype} dtype does not support aggregation '{how}'")

    result_mask = None
    mask = self.isna()
    if how == "rank":
        assert self.ordered  # checked earlier
        # Rank operates directly on the integer codes.
        npvalues = self._ndarray
    elif how in ["first", "last", "min", "max"]:
        npvalues = self._ndarray
        # Per-group output mask filled in by the cython op.
        result_mask = np.zeros(ngroups, dtype=bool)
    else:
        # any/all
        npvalues = self.astype(bool)

    res_values = op._cython_op_ndim_compat(
        npvalues,
        min_count=min_count,
        ngroups=ngroups,
        comp_ids=ids,
        mask=mask,
        result_mask=result_mask,
        **kwargs,
    )

    if how in op.cast_blocklist:
        # Ops whose ndarray results must not be cast back to Categorical
        # (membership defined by WrappedCythonOp.cast_blocklist).
        return res_values
    elif how in ["first", "last", "min", "max"]:
        # -1 is the NA sentinel in the categorical codes array.
        res_values[result_mask == 1] = -1
        return self._from_backing_data(res_values)


# The Series.cat accessor

Expand Down
82 changes: 82 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1549,6 +1549,88 @@ def _mode(self, dropna: bool = True):
npmodes = cast(np.ndarray, npmodes)
return self._from_backing_data(npmodes)

# ------------------------------------------------------------------
# GroupBy Methods

def _groupby_op(
    self,
    *,
    how: str,
    has_dropped_na: bool,
    min_count: int,
    ngroups: int,
    ids: npt.NDArray[np.intp],
    **kwargs,
):
    """
    Dispatch GroupBy reduction or transformation operation for
    datetime64 / timedelta64 / Period data.

    See ExtensionArray._groupby_op for the parameter descriptions;
    ids[i] gives the integer label of the group that self[i] belongs to.

    Raises
    ------
    TypeError
        If ``how`` is not meaningful for this datetimelike dtype.
    """
    dtype = self.dtype
    if dtype.kind == "M":
        # Adding/multiplying datetimes is not valid
        if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
            raise TypeError(f"datetime64 type does not support {how} operations")
        if how in ["any", "all"]:
            # GH#34479
            warnings.warn(
                f"'{how}' with datetime64 dtypes is deprecated and will raise in a "
                f"future version. Use (obj != pd.Timestamp(0)).{how}() instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    elif isinstance(dtype, PeriodDtype):
        # Adding/multiplying Periods is not valid
        if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
            raise TypeError(f"Period type does not support {how} operations")
        if how in ["any", "all"]:
            # GH#34479
            warnings.warn(
                f"'{how}' with PeriodDtype is deprecated and will raise in a "
                f"future version. Use (obj != pd.Period(0, freq)).{how}() instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
    else:
        # timedeltas we can add but not multiply
        if how in ["prod", "cumprod", "skew"]:
            raise TypeError(f"timedelta64 type does not support {how} operations")

    # All of the functions implemented here are ordinal, so we can
    # operate on the tz-naive equivalents
    npvalues = self._ndarray.view("M8[ns]")

    # Imported locally to avoid a circular import with groupby internals.
    from pandas.core.groupby.ops import WrappedCythonOp

    kind = WrappedCythonOp.get_kind_from_how(how)
    op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

    res_values = op._cython_op_ndim_compat(
        npvalues,
        min_count=min_count,
        ngroups=ngroups,
        comp_ids=ids,
        mask=None,
        **kwargs,
    )

    if op.how in op.cast_blocklist:
        # i.e. how in ["rank"], since other cast_blocklist methods don't go
        # through cython_operation
        return res_values

    # We did a view to M8[ns] above, now we go the other direction
    assert res_values.dtype == "M8[ns]"
    if how in ["std", "sem"]:
        # std/sem of datetimes are durations, so the result is timedelta.
        from pandas.core.arrays import TimedeltaArray

        if isinstance(self.dtype, PeriodDtype):
            raise TypeError("'std' and 'sem' are not valid for PeriodDtype")
        self = cast("DatetimeArray | TimedeltaArray", self)
        new_dtype = f"m8[{self.unit}]"
        res_values = res_values.view(new_dtype)
        return TimedeltaArray(res_values)

    # View back to the original dtype and rewrap in the original EA type.
    res_values = res_values.view(self._ndarray.dtype)
    return self._from_backing_data(res_values)


class DatelikeOps(DatetimeLikeArrayMixin):
"""
Expand Down
43 changes: 43 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -1382,3 +1382,46 @@ def _accumulate(
data, mask = op(data, mask, skipna=skipna, **kwargs)

return type(self)(data, mask, copy=False)

# ------------------------------------------------------------------
# GroupBy Methods

def _groupby_op(
    self,
    *,
    how: str,
    has_dropped_na: bool,
    min_count: int,
    ngroups: int,
    ids: npt.NDArray[np.intp],
    **kwargs,
):
    """
    Dispatch GroupBy reduction or transformation operation on masked data.

    See ExtensionArray._groupby_op for the parameter descriptions;
    ids[i] gives the integer label of the group that self[i] belongs to.

    Returns
    -------
    BaseMaskedArray
        Result rewrapped with a result mask via _maybe_mask_result.
    """
    # Imported locally to avoid a circular import with groupby internals.
    from pandas.core.groupby.ops import WrappedCythonOp

    kind = WrappedCythonOp.get_kind_from_how(how)
    op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

    # libgroupby functions are responsible for NOT altering mask
    mask = self._mask
    if op.kind != "aggregate":
        # Transforms are length-preserving, so the output mask starts as a
        # copy of the input mask.
        result_mask = mask.copy()
    else:
        # Aggregations produce one entry per group.
        result_mask = np.zeros(ngroups, dtype=bool)

    res_values = op._cython_op_ndim_compat(
        self._data,
        min_count=min_count,
        ngroups=ngroups,
        comp_ids=ids,
        mask=mask,
        result_mask=result_mask,
        **kwargs,
    )

    if op.how == "ohlc":
        # ohlc produces multiple output columns per group (its "arity"),
        # so broadcast the per-group mask across those columns.
        arity = op._cython_arity.get(op.how, 1)
        result_mask = np.tile(result_mask, (arity, 1)).T

    # res_values should already have the correct dtype, we just need to
    # wrap in a MaskedArray
    return self._maybe_mask_result(res_values, result_mask)
15 changes: 15 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1806,6 +1806,21 @@ def _formatter(self, boxed: bool = False):
# This will infer the correct formatter from the dtype of the values.
return None

# ------------------------------------------------------------------------
# GroupBy Methods

def _groupby_op(
self,
*,
how: str,
has_dropped_na: bool,
min_count: int,
ngroups: int,
ids: npt.NDArray[np.intp],
**kwargs,
):
raise NotImplementedError(f"{self.dtype} dtype not supported")


def _make_sparse(
arr: np.ndarray,
Expand Down
Loading