
ENH: Support skipna parameter in GroupBy min, max, prod, median, var, std and sem methods #60752


Merged (12 commits) on Feb 3, 2025
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -58,7 +58,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
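A minimal usage sketch of the new parameter (illustrative data, not from the PR; with ``skipna=False`` a missing value makes its group's result NA, per the kernel changes below):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

print(df.groupby("key")["val"].min())              # default skipna=True -> a: 1.0, b: 3.0
print(df.groupby("key")["val"].min(skipna=False))  # NaN propagates      -> a: NaN, b: 3.0
```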
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -185,6 +185,7 @@ def group_max(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
Expand All @@ -195,6 +196,7 @@ def group_min(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_idxmin_idxmax(
out: npt.NDArray[np.intp],
81 changes: 49 additions & 32 deletions pandas/_libs/groupby.pyx
@@ -815,7 +815,7 @@ def group_prod(
int64float_t[:, ::1] prodx
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
@@ -842,17 +842,16 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
_treat_as_na(prodx[lab, j], False)
):
# If prod is already NA, no need to update it
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, False)
isna_result = _treat_as_na(prodx[lab, j], False)

if not skipna and isna_result:
# If prod is already NA, no need to update it
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -890,7 +889,7 @@ def group_var(
floating[:, ::1] mean
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None
bint is_std = name == "std"
bint is_sem = name == "sem"

@@ -917,25 +916,24 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and out[lab, j] == NPY_NAT) or
_treat_as_na(out[lab, j], False)
):
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_var, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = out[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(out[lab, j], is_datetimelike)

if not skipna and isna_result:
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1201,7 +1199,7 @@ def group_mean(
mean_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

assert min_count == -1, "'min_count' only used in sum and prod"

@@ -1231,25 +1229,24 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_mean, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = sumx[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if not skipna and isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
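The ``group_prod``, ``group_var``, and ``group_mean`` hunks above share one restructuring: the NA flag of the running aggregate (``isna_result``) is now computed in the same branch that computes the entry flag, and the early ``continue`` fires only once both are known. A rough pure-Python sketch of the resulting per-element update for ``group_prod`` (names are illustrative, not the Cython API; the NaN-poisoning branch is assumed from the unshown remainder of the loop, consistent with the min/max hunk below):

```python
import math

def update_prod(agg: float, nobs: int, val: float, skipna: bool = True):
    # Mirrors the restructured loop body: compute both NA flags first.
    isna_entry = math.isnan(val)
    isna_result = math.isnan(agg)
    if not skipna and isna_result:
        return agg, nobs          # product already NA: no need to update it
    if not isna_entry:
        return agg * val, nobs + 1
    if not skipna:
        return math.nan, nobs     # NaN entry poisons the running product
    return agg, nobs              # skipna=True: ignore the NaN

agg, n = 1.0, 0
for v in [2.0, float("nan"), 4.0]:
    agg, n = update_prod(agg, n, v, skipna=False)
print(agg, n)  # nan 1
```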
@@ -1843,6 +1840,7 @@ cdef group_min_max(
bint compute_max=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
):
"""
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1870,6 +1868,8 @@
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
skipna : bool, default True
If True, ignore nans in `values`.

Notes
-----
@@ -1878,17 +1878,18 @@
"""
cdef:
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
numeric_t val
numeric_t val, nan_val
numeric_t[:, ::1] group_min_or_max
int64_t[:, ::1] nobs
bint uses_mask = mask is not None
bint isna_entry
bint isna_entry, isna_result

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
nan_val = _get_na_val(<numeric_t>0, is_datetimelike)

group_min_or_max = np.empty_like(out)
group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
@@ -1907,8 +1908,15 @@

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(group_min_or_max[lab, j],
is_datetimelike)

if not skipna and isna_result:
# If current min/max is already NA, it will always be NA
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1918,6 +1926,11 @@
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
group_min_or_max[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
@@ -2049,6 +2062,7 @@ def group_max(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
@@ -2061,6 +2075,7 @@
compute_max=True,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


@@ -2075,6 +2090,7 @@ def group_min(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
@@ -2087,6 +2103,7 @@
compute_max=False,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


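Taken together, the ``group_min_max`` changes give min/max the same semantics: once a group's running result is NA it stays NA, and with ``skipna=False`` a NaN entry poisons its group. A runnable pure-Python sketch under those assumptions (hypothetical helper, ignoring ``min_count`` and masked arrays):

```python
import math

def group_min_sketch(values, labels, ngroups, skipna=True):
    out = [math.inf] * ngroups               # running minimum per group
    for val, lab in zip(values, labels):
        if not skipna and math.isnan(out[lab]):
            continue                         # result already NaN; stays NaN
        if math.isnan(val):
            if not skipna:
                out[lab] = math.nan          # NaN poisons the group
            continue                         # skipna=True: ignore the NaN
        if val < out[lab]:
            out[lab] = val
    return out

vals, labs = [1.0, float("nan"), 3.0], [0, 0, 1]
print(group_min_sketch(vals, labs, 2))                # [1.0, 3.0]
print(group_min_sketch(vals, labs, 2, skipna=False))  # [nan, 3.0]
```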
10 changes: 7 additions & 3 deletions pandas/core/_numba/kernels/min_max_.py
@@ -80,14 +80,15 @@ def sliding_min_max(
return output, na_pos


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_min_max(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
is_max: bool,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
@@ -97,13 +98,16 @@
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
continue

if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
else:
# NaN value cannot be a min/max value
if not skipna:
# If skipna is False and we encounter a NaN,
# both min and max of the group will be NaN
output[lab] = np.nan
continue

if nobs[lab] == 1:
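The Numba kernel above implements the same rule for the ``engine="numba"`` path. A hedged end-to-end check (assumes ``numba`` is installed; both engines should agree on the result):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
gb = df.groupby("key")["val"]

print(gb.max(skipna=False))                  # Cython path: a -> NaN, b -> 3.0
print(gb.max(skipna=False, engine="numba"))  # Numba kernel above, same result
```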
12 changes: 10 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -3068,12 +3068,13 @@ def prod(

@final
@doc(
_groupby_agg_method_engine_template,
_groupby_agg_method_skipna_engine_template,
fname="min",
no=False,
mc=-1,
e=None,
ek=None,
s=True,
example=dedent(
"""\
For SeriesGroupBy:
@@ -3113,6 +3114,7 @@ def min(
self,
numeric_only: bool = False,
min_count: int = -1,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -3125,23 +3127,26 @@
engine_kwargs,
min_periods=min_count,
is_max=False,
skipna=skipna,
)
else:
return self._agg_general(
numeric_only=numeric_only,
min_count=min_count,
skipna=skipna,
alias="min",
npfunc=np.min,
)

@final
@doc(
_groupby_agg_method_engine_template,
_groupby_agg_method_skipna_engine_template,
fname="max",
no=False,
mc=-1,
e=None,
ek=None,
s=True,
example=dedent(
"""\
For SeriesGroupBy:
@@ -3181,6 +3186,7 @@ def max(
self,
numeric_only: bool = False,
min_count: int = -1,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -3193,11 +3199,13 @@
engine_kwargs,
min_periods=min_count,
is_max=True,
skipna=skipna,
)
else:
return self._agg_general(
numeric_only=numeric_only,
min_count=min_count,
skipna=skipna,
alias="max",
npfunc=np.max,
)
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_api.py
@@ -174,7 +174,7 @@ def test_frame_consistency(groupby_func):
elif groupby_func in ("nunique",):
exclude_expected = {"axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_expected = {"axis", "kwargs"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("sum", "mean", "std", "var"):
exclude_expected = {"axis", "kwargs"}
@@ -234,7 +234,7 @@ def test_series_consistency(request, groupby_func):
if groupby_func in ("any", "all"):
exclude_expected = {"kwargs", "bool_only", "axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_expected = {"axis", "kwargs"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("sum", "mean", "std", "var"):
exclude_expected = {"axis", "kwargs"}