
REF: let EAs override WrappedCythonOp groupby implementations #51166

Merged: 17 commits, Apr 5, 2023
Changes from 6 commits
75 changes: 75 additions & 0 deletions pandas/core/arrays/base.py
@@ -1728,6 +1728,81 @@ def map(self, mapper, na_action=None):
"""
return map_array(self, mapper, na_action=na_action)

    # ------------------------------------------------------------------------
    # GroupBy Methods

    def _groupby_op(
        self,
        *,
        how: str,
        has_dropped_na: bool,
        min_count: int,
        ngroups: int,
        ids: npt.NDArray[np.intp],
        **kwargs,
    ) -> ArrayLike:
        """
        Dispatch GroupBy reduction or transformation operation.

        This is an *experimental* API to allow ExtensionArray authors to implement
        reductions and transformations. The API is subject to change.

        Parameters
        ----------
        how : {'sum', 'prod', 'min', 'max', 'mean', 'median',
               'var', 'nth', 'last', 'ohlc',
               'cumprod', 'cumsum', 'cummin', 'cummax', 'rank'}
        has_dropped_na : bool
        min_count : int
        ngroups : int
        ids : np.ndarray[np.intp]
            ids[i] gives the integer label for the group that self[i] belongs to.
        **kwargs : operation-specific
            'var' -> ['ddof']
            'cumprod', 'cumsum', 'cummin', 'cummax' -> ['skipna']
            'rank' -> ['ties_method', 'ascending', 'na_option', 'pct']

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        from pandas.core.arrays.string_ import StringDtype
        from pandas.core.groupby.ops import WrappedCythonOp

        kind = WrappedCythonOp.get_kind_from_how(how)
        op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

        # GH#43682
        if isinstance(self.dtype, StringDtype):
            # StringArray
            npvalues = self.to_numpy(object, na_value=np.nan)
        else:
            raise NotImplementedError(
                f"function is not implemented for this dtype: {self.dtype}"
            )

        res_values = op._cython_op_ndim_compat(
            npvalues,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=ids,
            mask=None,
            **kwargs,
        )

        if op.how in op.cast_blocklist:
            # i.e. how in ["rank"], since other cast_blocklist methods don't go
            # through cython_operation
            return res_values

        if isinstance(self.dtype, StringDtype):
            dtype = self.dtype
            string_array_cls = dtype.construct_array_type()
            return string_array_cls._from_sequence(res_values, dtype=dtype)

        else:
            raise NotImplementedError
Comment on lines +1793 to +1799

Member:
hasn't this if-then already been done above? or is it just to future-proof the code?

Member Author:
future-proof is a nice way of putting it, yes. this is transplanted from its current position in WrappedCythonOp where the redundant checks are in separate methods



class ExtensionArraySupportsAnyAll(ExtensionArray):
    def any(self, *, skipna: bool = True) -> bool:
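For orientation, here is a small usage sketch (illustrative only, not part of the diff) of the one case this base-class implementation handles: a StringDtype column is converted to an object ndarray, the cython rank kernel runs, and because "rank" is in cast_blocklist the float result is returned without casting back to strings. The frame and group keys below are made up.

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["x", "x", "y", "y"],
        "val": pd.array(["b", "a", "c", pd.NA], dtype="string"),
    }
)

# Dispatches through ExtensionArray._groupby_op: StringArray -> object ndarray,
# cython group_rank, and the float64 ranks are returned as-is ("rank" is in
# cast_blocklist, so there is no cast back to StringDtype).
print(df.groupby("key")["val"].rank())
```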
48 changes: 48 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -2357,6 +2357,54 @@ def _str_get_dummies(self, sep: str = "|"):

        return PandasArray(self.astype(str))._str_get_dummies(sep)

    # ------------------------------------------------------------------------
    # GroupBy Methods

    def _groupby_op(
        self,
        *,
        how: str,
        has_dropped_na: bool,
        min_count: int,
        ngroups: int,
        ids: npt.NDArray[np.intp],
        **kwargs,
    ):
        from pandas.core.groupby.ops import WrappedCythonOp

        kind = WrappedCythonOp.get_kind_from_how(how)
        op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

        dtype = self.dtype
        if how in ["sum", "prod", "cumsum", "cumprod"]:
            raise TypeError(f"{dtype} type does not support {how} operations")
        if how in ["min", "max", "rank"] and not dtype.ordered:
            # raise TypeError instead of NotImplementedError to ensure we
            # don't go down a group-by-group path, since in the empty-groups
            # case that would fail to raise
            raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
        if how not in ["rank"]:
            # only "rank" is implemented in cython
            raise NotImplementedError(f"{dtype} dtype not supported")

        assert how == "rank"  # the only one implemented ATM
        assert self.ordered  # checked earlier
        mask = self.isna()
        npvalues = self._ndarray

        res_values = op._cython_op_ndim_compat(
            npvalues,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=ids,
            mask=mask,
            **kwargs,
        )

        # If we ever have more than just "rank" here, we'll need to do
        # `if op.how in op.cast_blocklist` like we do for other dtypes.
        return res_values


# The Series.cat accessor

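A usage sketch of the error paths above (illustrative, not from the diff); the exact messages are the ones added in this branch and may differ in later releases. For an ordered Categorical, rank is the one operation that reaches the cython kernels here.

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["x", "x", "y"],
        "cat": pd.Categorical(["a", "b", "a"]),  # unordered
    }
)

# sum/prod/cumsum/cumprod are rejected for any Categorical
try:
    df.groupby("key")["cat"].sum()
except TypeError as err:
    print(err)  # e.g. "category type does not support sum operations"

# min/max/rank additionally require ordered=True, and raise TypeError (not
# NotImplementedError) so no group-by-group fallback is attempted
try:
    df.groupby("key")["cat"].min()
except TypeError as err:
    print(err)  # e.g. "Cannot perform min with non-ordered Categorical"
```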
55 changes: 55 additions & 0 deletions pandas/core/arrays/datetimelike.py
@@ -1575,6 +1575,61 @@ def _mode(self, dropna: bool = True):
        npmodes = cast(np.ndarray, npmodes)
        return self._from_backing_data(npmodes)

    # ------------------------------------------------------------------
    # GroupBy Methods

    def _groupby_op(
        self,
        *,
        how: str,
        has_dropped_na: bool,
        min_count: int,
        ngroups: int,
        ids: npt.NDArray[np.intp],
        **kwargs,
    ):
        dtype = self.dtype
        if dtype.kind == "M":
            # Adding/multiplying datetimes is not valid
            if how in ["sum", "prod", "cumsum", "cumprod"]:
                raise TypeError(f"datetime64 type does not support {how} operations")
        elif is_period_dtype(dtype):
            # Adding/multiplying Periods is not valid
            if how in ["sum", "prod", "cumsum", "cumprod"]:
                raise TypeError(f"Period type does not support {how} operations")
        else:
            # timedeltas we can add but not multiply
            if how in ["prod", "cumprod"]:
                raise TypeError(f"timedelta64 type does not support {how} operations")

        # All of the functions implemented here are ordinal, so we can
        # operate on the tz-naive equivalents
        npvalues = self._ndarray.view("M8[ns]")

        from pandas.core.groupby.ops import WrappedCythonOp

        kind = WrappedCythonOp.get_kind_from_how(how)
        op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

        res_values = op._cython_op_ndim_compat(
            npvalues,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=ids,
            mask=None,
            **kwargs,
        )

        if op.how in op.cast_blocklist:
            # i.e. how in ["rank"], since other cast_blocklist methods don't go
            # through cython_operation
            return res_values

        # We did a view to M8[ns] above, now we go the other direction
        assert res_values.dtype == "M8[ns]"
        res_values = res_values.view(self._ndarray.dtype)
        return self._from_backing_data(res_values)


class DatelikeOps(DatetimeLikeArrayMixin):
"""
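An illustrative sketch (not part of the diff) of what the branches above mean for users: cumulative min/max on a tz-aware datetime column run on the tz-naive M8[ns] view and come back with the original dtype, while sum is rejected up front.

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["x", "x", "y"],
        "ts": pd.to_datetime(
            ["2023-01-02", "2023-01-01", "2023-01-03"]
        ).tz_localize("UTC"),
    }
)

# Runs on the tz-naive M8[ns] view, then the result is viewed back, so the
# UTC-aware dtype is preserved.
print(df.groupby("key")["ts"].cummin())

# Adding datetimes is not valid, so sum raises before reaching cython.
try:
    df.groupby("key")["ts"].sum()
except TypeError as err:
    print(err)  # e.g. "datetime64 type does not support sum operations"
```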
43 changes: 43 additions & 0 deletions pandas/core/arrays/masked.py
@@ -1389,3 +1389,46 @@ def _accumulate(
        data, mask = op(data, mask, skipna=skipna, **kwargs)

        return type(self)(data, mask, copy=False)

    # ------------------------------------------------------------------
    # GroupBy Methods

    def _groupby_op(
        self,
        *,
        how: str,
        has_dropped_na: bool,
        min_count: int,
        ngroups: int,
        ids: npt.NDArray[np.intp],
        **kwargs,
    ):
        from pandas.core.groupby.ops import WrappedCythonOp

        kind = WrappedCythonOp.get_kind_from_how(how)
        op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

        # libgroupby functions are responsible for NOT altering mask
        mask = self._mask
Member:
We should probably add tests covering this at some point. Not sure if we already have them
        if op.kind != "aggregate":
            result_mask = mask.copy()
        else:
            result_mask = np.zeros(ngroups, dtype=bool)

        res_values = op._cython_op_ndim_compat(
            self._data,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=ids,
            mask=mask,
            result_mask=result_mask,
            **kwargs,
        )

        if op.how == "ohlc":
            arity = op._cython_arity.get(op.how, 1)
            result_mask = np.tile(result_mask, (arity, 1)).T

        # res_values should already have the correct dtype, we just need to
        # wrap in a MaskedArray
        return self._maybe_mask_result(res_values, result_mask)
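A usage sketch for the masked path (illustrative, not from the diff): with a nullable Int64 column, the aggregation allocates a fresh result_mask of length ngroups and _maybe_mask_result wraps the cython output back into a masked array, so groups that fail min_count come back as NA rather than 0 or NaN.

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["x", "x", "y", "y"],
        "val": pd.array([1, pd.NA, 3, pd.NA], dtype="Int64"),
    }
)

# Each group has only one non-NA value, so with min_count=2 both groups come
# back as <NA>; the result keeps the masked Int64 dtype.
print(df.groupby("key")["val"].sum(min_count=2))
```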
15 changes: 15 additions & 0 deletions pandas/core/arrays/sparse/array.py
@@ -1811,6 +1811,21 @@ def _formatter(self, boxed: bool = False):
        # This will infer the correct formatter from the dtype of the values.
        return None

    # ------------------------------------------------------------------------
    # GroupBy Methods

    def _groupby_op(
        self,
        *,
        how: str,
        has_dropped_na: bool,
        min_count: int,
        ngroups: int,
        ids: npt.NDArray[np.intp],
        **kwargs,
    ):
        raise NotImplementedError(f"{self.dtype} dtype not supported")


def _make_sparse(
    arr: np.ndarray,