Skip to content

ENH: Support skipna parameter in GroupBy min, max, prod, median, var, std and sem methods #60752

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Feb 3, 2025
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def group_prod(
mask: np.ndarray | None,
result_mask: np.ndarray | None = ...,
min_count: int = ...,
skipna: bool = ...,
) -> None: ...
def group_var(
out: np.ndarray, # floating[:, ::1]
Expand All @@ -88,6 +89,7 @@ def group_var(
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ...,
name: str = ...,
skipna: bool = ...,
) -> None: ...
def group_skew(
out: np.ndarray, # float64_t[:, ::1]
Expand Down
39 changes: 38 additions & 1 deletion pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -806,13 +806,14 @@ def group_prod(
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
int64float_t val
int64float_t val, nan_val
int64float_t[:, ::1] prodx
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
Expand All @@ -825,6 +826,13 @@ def group_prod(
prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
if uses_mask:
nan_val = 0
elif int64float_t is int64_t or int64float_t is uint64_t:
# This has no effect as int64 can't be nan. Setting to 0 to avoid type error
nan_val = 0
else:
nan_val = NAN

with nogil:
for i in range(N):
Expand All @@ -836,6 +844,13 @@ def group_prod(
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
_treat_as_na(prodx[lab, j], False)
):
# If prod is already NA, no need to update it
continue

if uses_mask:
isna_entry = mask[i, j]
else:
Expand All @@ -844,6 +859,11 @@ def group_prod(
if not isna_entry:
nobs[lab, j] += 1
prodx[lab, j] *= val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
prodx[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
Expand All @@ -864,6 +884,7 @@ def group_var(
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
str name="var",
bint skipna=True,
) -> None:
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
Expand Down Expand Up @@ -898,6 +919,16 @@ def group_var(
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and out[lab, j] == NPY_NAT) or
_treat_as_na(out[lab, j], False)
):
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
elif is_datetimelike:
Expand All @@ -913,6 +944,12 @@ def group_var(
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
elif not skipna:
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
out[lab, j] = NAN

for i in range(ncounts):
for j in range(K):
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/_numba/kernels/var_.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def grouped_var(
ngroups: int,
min_periods: int,
ddof: int = 1,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)

Expand All @@ -190,7 +191,15 @@ def grouped_var(
lab = labels[i]
val = values[i]

if lab < 0:
if lab < 0 or np.isnan(output[lab]):
continue

if not skipna and np.isnan(val):
output[lab] = np.nan
nobs_arr[lab] += 1
comp_arr[lab] = np.nan
consecutive_counts[lab] = 1
prev_vals[lab] = np.nan
continue

mean_x = means[lab]
Expand Down
51 changes: 45 additions & 6 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2349,6 +2349,7 @@ def std(
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool = False,
skipna: bool = True,
):
"""
Compute standard deviation of groups, excluding missing values.
Expand Down Expand Up @@ -2387,6 +2388,12 @@ def std(

numeric_only now defaults to ``False``.

skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -2441,14 +2448,16 @@ def std(
engine_kwargs,
min_periods=0,
ddof=ddof,
skipna=skipna,
)
)
else:
return self._cython_agg_general(
"std",
alt=lambda x: Series(x, copy=False).std(ddof=ddof),
alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna),
numeric_only=numeric_only,
ddof=ddof,
skipna=skipna,
)

@final
Expand All @@ -2460,6 +2469,7 @@ def var(
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool = False,
skipna: bool = True,
):
"""
Compute variance of groups, excluding missing values.
Expand Down Expand Up @@ -2497,6 +2507,12 @@ def var(

numeric_only now defaults to ``False``.

skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -2550,13 +2566,15 @@ def var(
engine_kwargs,
min_periods=0,
ddof=ddof,
skipna=skipna,
)
else:
return self._cython_agg_general(
"var",
alt=lambda x: Series(x, copy=False).var(ddof=ddof),
alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna),
numeric_only=numeric_only,
ddof=ddof,
skipna=skipna,
)

@final
Expand Down Expand Up @@ -2686,7 +2704,9 @@ def _value_counts(
return result.__finalize__(self.obj, method="value_counts")

@final
def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
def sem(
self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True
) -> NDFrameT:
"""
Compute standard error of the mean of groups, excluding missing values.

Expand All @@ -2706,6 +2726,12 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:

numeric_only now defaults to ``False``.

skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -2780,9 +2806,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
)
return self._cython_agg_general(
"sem",
alt=lambda x: Series(x, copy=False).sem(ddof=ddof),
alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna),
numeric_only=numeric_only,
ddof=ddof,
skipna=skipna,
)

@final
Expand Down Expand Up @@ -2959,7 +2986,9 @@ def sum(
return result

@final
def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
def prod(
self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True
) -> NDFrameT:
"""
Compute prod of group values.

Expand All @@ -2976,6 +3005,12 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
The required number of valid values to perform the operation. If fewer
than ``min_count`` non-NA values are present the result will be NA.

skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -3024,7 +3059,11 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
2 30 72
"""
return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
numeric_only=numeric_only,
min_count=min_count,
skipna=skipna,
alias="prod",
npfunc=np.prod,
)

@final
Expand Down
51 changes: 50 additions & 1 deletion pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1440,12 +1440,61 @@ def var(
return self._downsample("var", ddof=ddof, numeric_only=numeric_only)

@final
@doc(GroupBy.sem)
def sem(
self,
ddof: int = 1,
numeric_only: bool = False,
):
"""
Compute standard error of the mean of groups, excluding missing values.

For multiple groupings, the result index will be a MultiIndex.

Parameters
----------
ddof : int, default 1
Degrees of freedom.

numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

.. versionchanged:: 2.0.0

numeric_only now defaults to ``False``.

Returns
-------
Series or DataFrame
Standard error of the mean of values within each group.

See Also
--------
DataFrame.sem : Return unbiased standard error of the mean over requested axis.
Series.sem : Return unbiased standard error of the mean over requested axis.

Examples
--------

>>> ser = pd.Series(
... [1, 3, 2, 4, 3, 8],
... index=pd.DatetimeIndex(
... [
... "2023-01-01",
... "2023-01-10",
... "2023-01-15",
... "2023-02-01",
... "2023-02-10",
... "2023-02-15",
... ]
... ),
... )
>>> ser.resample("MS").sem()
2023-01-01 0.577350
2023-02-01 1.527525
Freq: MS, dtype: float64
"""
return self._downsample("sem", ddof=ddof, numeric_only=numeric_only)

@final
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/aggregate/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["sum", "mean"])
@pytest.mark.parametrize("func", ["sum", "mean", "var", "std"])
def test_multifunc_numba_vs_cython_frame_noskipna(func):
pytest.importorskip("numba")
data = DataFrame(
Expand Down
Loading
Loading