Skip to content

REF: implement groupby std, min, max as EA methods #51116

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@

import numpy as np

from pandas._libs import lib
from pandas._libs import (
groupby as libgroupby,
lib,
)
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
Expand All @@ -43,6 +46,7 @@
is_datetime64_dtype,
is_dtype_equal,
is_list_like,
is_object_dtype,
is_scalar,
is_timedelta64_dtype,
pandas_dtype,
Expand Down Expand Up @@ -1722,6 +1726,76 @@ def map(self, mapper, na_action=None):
"""
return map_array(self, mapper, na_action=na_action)

# ------------------------------------------------------------------------
# GroupBy Methods

def _groupby_any_all(
    self,
    *,
    ngroups: int,
    ids: npt.NDArray[np.intp],
    val_test: Literal["any", "all"],
    skipna: bool,
):
    """
    Compute a group-wise ``any``/``all`` reduction over this array.

    Parameters
    ----------
    ngroups : int
        Number of rows allocated in the output buffer (one per group).
    ids : np.ndarray[np.intp]
        Passed as ``labels`` to ``libgroupby.group_any_all``.
    val_test : {"any", "all"}
        Which reduction to run; forwarded to the cython routine.
    skipna : bool
        Forwarded to the cython routine. For object dtype it additionally
        causes missing entries to be replaced by ``True`` before the
        boolean cast.

    Returns
    -------
    np.ndarray
        Boolean ndarray. For 1-D input the single result column is
        returned; otherwise the transpose of the ``(ngroups, ncols)``
        buffer is returned.
    """
    # Computed once: used both for the object-dtype fill below and as the
    # mask handed to the cython routine (it always describes ``self``, not
    # the filled copy).
    na_mask = self.isna()

    obj = self
    if skipna and is_object_dtype(self.dtype) and na_mask.any():
        # GH#37501: don't raise on pd.NA when skipna=True
        # Fill a copy so the caller's data is left untouched.
        obj = obj.copy()
        obj[na_mask] = True

    vals = obj.astype(bool, copy=False).view(np.int8)
    mask = np.asarray(na_mask).view(np.uint8)

    ncols = 1 if self.ndim == 1 else self.shape[0]
    result = np.zeros((ngroups, ncols), dtype=np.int8)

    # Call func to modify result in place
    libgroupby.group_any_all(
        out=result,
        labels=ids,
        values=np.atleast_2d(vals).T,
        mask=np.atleast_2d(mask).T,
        val_test=val_test,
        skipna=skipna,
        nullable=False,
    )

    if self.ndim == 1:
        # Drop the column axis that was added for the 2-D cython routine.
        assert result.shape[1] == 1, result.shape
        result = result[:, 0]

    return result.astype(bool, copy=False).T

def _groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int):
    """
    Compute a group-wise standard deviation over this array.

    The values are cast to float64, ``libgroupby.group_var`` fills a
    variance buffer in place, and the square root of that buffer is
    returned.

    Parameters
    ----------
    ngroups : int
        Number of rows allocated in the output buffer (one per group).
    ids : np.ndarray[np.intp]
        Passed as ``labels`` to ``libgroupby.group_var``.
    ddof : int
        Forwarded to ``libgroupby.group_var``.

    Returns
    -------
    np.ndarray
        float64 ndarray. For 1-D input the single result column is
        returned; otherwise the transpose of the ``(ngroups, ncols)``
        buffer is returned.
    """
    float_dtype = np.dtype(np.float64)
    is_1d = self.ndim == 1
    width = 1 if is_1d else self.shape[0]

    variances = np.zeros((ngroups, width), dtype=float_dtype)
    group_counts = np.zeros(ngroups, dtype=np.int64)

    # The cython routine works on 2-D input, so promote and transpose.
    as_float = np.atleast_2d(self.astype(float_dtype, copy=False)).T

    # group_var writes into ``variances`` in place.
    libgroupby.group_var(
        out=variances,
        counts=group_counts,
        values=as_float,
        labels=ids,
        ddof=ddof,
    )

    if is_1d:
        # Drop the column axis that was added for the cython routine.
        assert variances.shape[1] == 1, variances.shape
        variances = variances[:, 0]

    return np.sqrt(variances).T


class ExtensionArraySupportsAnyAll(ExtensionArray):
def any(self, *, skipna: bool = True) -> bool:
Expand Down
34 changes: 34 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from pandas._libs import (
algos,
groupby as libgroupby,
lib,
)
from pandas._libs.arrays import NDArrayBacked
Expand Down Expand Up @@ -2104,6 +2105,39 @@ def factorize(
# FIXME: shouldn't get here; we are ignoring sort
return super().factorize(use_na_sentinel=use_na_sentinel)

def _groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int):
    """
    Compute a group-wise standard deviation over this datetime-like array.

    The array is viewed as int64 and cast to float64,
    ``libgroupby.group_var`` (with ``is_datetimelike=True``) fills a
    variance buffer in place, and the square root of that buffer is cast
    back to int64 and viewed as ``m8[<self.unit>]``.

    Parameters
    ----------
    ngroups : int
        Number of rows allocated in the output buffer (one per group).
    ids : np.ndarray[np.intp]
        Passed as ``labels`` to ``libgroupby.group_var``.
    ddof : int
        Forwarded to ``libgroupby.group_var``.

    Returns
    -------
    np.ndarray
        timedelta64 ndarray in this array's unit. For 1-D input the single
        result column is returned; otherwise the transpose of the
        ``(ngroups, ncols)`` buffer is returned.
    """
    cython_dtype = np.dtype(np.float64)

    ncols = 1 if self.ndim == 1 else self.shape[0]

    result = np.zeros((ngroups, ncols), dtype=cython_dtype)
    counts = np.zeros(ngroups, dtype=np.int64)

    vals = self.view("i8").astype(cython_dtype, copy=False)

    # Call func to modify result in place
    libgroupby.group_var(
        out=result,
        labels=ids,
        values=np.atleast_2d(vals).T,
        counts=counts,
        ddof=ddof,
        is_datetimelike=True,
    )

    if self.ndim == 1:
        # Drop the column axis that was added for the 2-D cython routine.
        assert result.shape[1] == 1, result.shape
        result = result[:, 0]

    result = np.sqrt(result)

    with warnings.catch_warnings():
        # Suppress only "RuntimeWarning: invalid value encountered in cast"
        # (presumably triggered by NaN results -- TODO confirm) instead of
        # silencing every warning raised inside this block.
        warnings.filterwarnings(
            "ignore",
            message="invalid value encountered in cast",
            category=RuntimeWarning,
        )
        result = result.astype(np.int64, copy=False)

    result = result.view(f"m8[{self.unit}]")
    return result.T


# -------------------------------------------------------------------
# Shared Constructor Helpers
Expand Down
62 changes: 62 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import numpy as np

from pandas._libs import (
groupby as libgroupby,
lib,
missing as libmissing,
)
Expand Down Expand Up @@ -1382,3 +1383,64 @@ def _accumulate(
data, mask = op(data, mask, skipna=skipna, **kwargs)

return type(self)(data, mask, copy=False)

# ------------------------------------------------------------------------
# GroupBy Methods

def _groupby_any_all(
    self,
    *,
    ngroups: int,
    ids: npt.NDArray[np.intp],
    val_test: Literal["any", "all"],
    skipna: bool,
):
    """
    Compute a group-wise ``any``/``all`` reduction over this masked array.

    Parameters
    ----------
    ngroups : int
        Number of rows allocated in the output buffer (one per group).
    ids : np.ndarray[np.intp]
        Passed as ``labels`` to ``libgroupby.group_any_all``.
    val_test : {"any", "all"}
        Which reduction to run; forwarded to the cython routine.
    skipna : bool
        Forwarded to the cython routine.

    Returns
    -------
    BooleanArray
        One entry per group; entries whose raw result equals ``-1`` are
        marked as missing.
    """
    from pandas.core.arrays import BooleanArray

    # The cython routine works on 2-D int8/uint8 input, so reinterpret the
    # data and the NA mask and give each a single column.
    data_col = self._data.astype(bool, copy=False).view(np.int8).reshape(-1, 1)
    na_col = self.isna().view(np.uint8).reshape(-1, 1)
    out = np.zeros((ngroups, 1), dtype=np.int8)

    # group_any_all writes into ``out`` in place.
    libgroupby.group_any_all(
        out=out,
        values=data_col,
        labels=ids,
        mask=na_col,
        val_test=val_test,
        skipna=skipna,
        nullable=True,
    )

    assert out.shape[1] == 1, out.shape
    flat = out[:, 0]

    # -1 in the raw output becomes the mask of the returned array.
    return BooleanArray(flat.astype(bool, copy=False), flat == -1)

def _groupby_std(self, *, ngroups: int, ids: npt.NDArray[np.intp], ddof: int):
    """
    Compute a group-wise standard deviation over this masked array.

    ``libgroupby.group_var`` fills a variance buffer and a result mask in
    place; the square root of the variances is wrapped together with that
    mask.

    Parameters
    ----------
    ngroups : int
        Number of rows allocated in the output buffers (one per group).
    ids : np.ndarray[np.intp]
        Passed as ``labels`` to ``libgroupby.group_var``.
    ddof : int
        Forwarded to ``libgroupby.group_var``.

    Returns
    -------
    FloatingArray
        One entry per group, masked according to the result mask filled
        in by ``libgroupby.group_var``.
    """
    from pandas.core.arrays import FloatingArray

    # The cython routine works on 2-D input, so give the data and the NA
    # mask a single column each.
    data_col = self._data.astype(np.float64, copy=False).reshape(-1, 1)
    na_col = self.isna().view(np.uint8).reshape(-1, 1)

    variances = np.zeros((ngroups, 1), dtype=np.float64)
    out_mask = np.zeros(variances.shape, dtype=np.bool_)
    group_counts = np.zeros(ngroups, dtype=np.int64)

    # group_var writes into ``variances`` and ``out_mask`` in place.
    libgroupby.group_var(
        out=variances,
        counts=group_counts,
        values=data_col,
        labels=ids,
        mask=na_col,
        result_mask=out_mask,
        ddof=ddof,
    )

    assert variances.shape[1] == 1, variances.shape
    flat_var = variances[:, 0]

    assert out_mask.shape[1] == 1, out_mask.shape
    flat_mask = out_mask[:, 0]

    return FloatingArray(np.sqrt(flat_var), flat_mask.view(np.bool_))
Loading