ENH: Support skipna parameter in GroupBy min, max, prod, median, var, std and sem methods #60752

Merged: 12 commits, Feb 3, 2025
doc/source/whatsnew/v3.0.0.rst (1 addition, 1 deletion)
@@ -58,9 +58,9 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
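
For a sense of the user-facing change described in the whatsnew entry above, here is a minimal sketch; the frame and values are illustrative, not from the PR:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Default behaviour is unchanged: NaNs are skipped within each group.
df.groupby("key")["val"].min()              # a -> 1.0, b -> 3.0

# With skipna=False, a NaN anywhere in a group propagates to its result.
df.groupby("key")["val"].min(skipna=False)  # a -> NaN, b -> 3.0
```

The same keyword applies to ``max``, ``prod``, ``median``, ``var``, ``std`` and ``sem``.
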
pandas/_libs/groupby.pyi (5 additions, 0 deletions)
@@ -13,6 +13,7 @@ def group_median_float64(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ..., # bint
skipna: bool = ...,
) -> None: ...
def group_cumprod(
out: np.ndarray, # float64_t[:, ::1]
@@ -76,6 +77,7 @@ def group_prod(
mask: np.ndarray | None,
result_mask: np.ndarray | None = ...,
min_count: int = ...,
skipna: bool = ...,
) -> None: ...
def group_var(
out: np.ndarray, # floating[:, ::1]
@@ -88,6 +90,7 @@ def group_var(
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ...,
name: str = ...,
skipna: bool = ...,
) -> None: ...
def group_skew(
out: np.ndarray, # float64_t[:, ::1]
@@ -183,6 +186,7 @@ def group_max(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
@@ -193,6 +197,7 @@ def group_min(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_idxmin_idxmax(
out: npt.NDArray[np.intp],
pandas/_libs/groupby.pyx (77 additions, 22 deletions)
@@ -62,7 +62,12 @@ cdef enum InterpolationEnumType:
INTERPOLATION_MIDPOINT


cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
cdef float64_t median_linear_mask(
float64_t* a,
int n,
uint8_t* mask,
bint skipna=True
) noexcept nogil:
cdef:
int i, j, na_count = 0
float64_t* tmp
@@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
na_count += 1

if na_count:
if na_count == n:
if na_count == n or not skipna:
return NaN

tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
cdef float64_t median_linear(
float64_t* a,
int n,
bint is_datetimelike=False
bint is_datetimelike=False,
bint skipna=True,
) noexcept nogil:
cdef:
int i, j, na_count = 0
@@ -125,7 +131,7 @@ cdef float64_t median_linear(
na_count += 1

if na_count:
if na_count == n:
if na_count == n or not skipna:
return NaN

tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -186,6 +192,7 @@ def group_median_float64(
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
@@ -229,7 +236,7 @@

for j in range(ngroups):
size = _counts[j + 1]
result = median_linear_mask(ptr, size, ptr_mask)
result = median_linear_mask(ptr, size, ptr_mask, skipna)
out[j, i] = result

if result != result:
@@ -244,7 +251,7 @@
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size, is_datetimelike)
out[j, i] = median_linear(ptr, size, is_datetimelike, skipna)
ptr += size
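
The guard added to both median helpers (``na_count == n or not skipna``) is the whole skipna story for medians: any NA short-circuits the group to NaN when ``skipna=False``. A plain-NumPy paraphrase, where the function name is my own:

```python
import numpy as np

def median_skipna_sketch(a: np.ndarray, skipna: bool = True) -> float:
    # Mirrors median_linear's NA handling (illustrative, not the real kernel).
    if a.size == 0:
        return np.nan
    na_count = int(np.isnan(a).sum())
    if na_count:
        if na_count == a.size or not skipna:
            # All-NA group, or any NA with skipna=False: the median is NaN.
            return np.nan
        # Otherwise compact to the non-NA values, as the C helper does.
        a = a[~np.isnan(a)]
    return float(np.median(a))
```
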


@@ -804,17 +811,18 @@ def group_prod(
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
int64float_t val
int64float_t val, nan_val
int64float_t[:, ::1] prodx
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
@@ -823,6 +831,7 @@ def group_prod(
prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
nan_val = _get_na_val(<int64float_t>0, False)

with nogil:
for i in range(N):
@@ -836,12 +845,23 @@

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, False)
isna_result = _treat_as_na(prodx[lab, j], False)

if not skipna and isna_result:
# If prod is already NA, no need to update it
continue

if not isna_entry:
nobs[lab, j] += 1
prodx[lab, j] *= val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
prodx[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
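
In words: with ``skipna=False`` the first NA "poisons" the group's product, and the ``isna_result`` check makes every later row of that group a no-op. A plain-Python rendering, dropping the mask and ``min_count`` details; the function name is mine:

```python
import numpy as np

def grouped_prod_sketch(values, labels, ngroups, skipna=True):
    # Illustrative analogue of group_prod's new skipna branch.
    out = np.ones(ngroups)
    for val, lab in zip(values, labels):
        if lab < 0:
            continue  # row not assigned to any group
        if not skipna and np.isnan(out[lab]):
            continue  # product is already NA; no need to update it
        if not np.isnan(val):
            out[lab] *= val
        elif not skipna:
            out[lab] = np.nan  # the first NA poisons the group
    return out

grouped_prod_sketch([2.0, np.nan, 3.0], [0, 0, 1], ngroups=2, skipna=False)
# -> array([nan,  3.])
```
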
@@ -862,14 +882,15 @@ def group_var(
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
str name="var",
bint skipna=True,
) -> None:
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, ct, oldmean
floating[:, ::1] mean
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None
bint is_std = name == "std"
bint is_sem = name == "sem"

@@ -898,19 +919,34 @@

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_var, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = out[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(out[lab, j], is_datetimelike)

if not skipna and isna_result:
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
elif not skipna:
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
out[lab, j] = NAN

for i in range(ncounts):
for j in range(K):
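
The update pair above (``mean += (val - oldmean) / nobs`` followed by ``out += (val - mean) * (val - oldmean)``) is Welford's streaming variance algorithm, so skipna support reduces to never updating a poisoned accumulator. A single-column Python rendering, where the function name and the ``ddof`` handling are my own:

```python
import numpy as np

def grouped_var_sketch(values, labels, ngroups, skipna=True, ddof=1):
    # Welford-style grouped variance mirroring group_var (illustrative).
    nobs = np.zeros(ngroups, dtype=np.int64)
    mean = np.zeros(ngroups)
    m2 = np.zeros(ngroups)  # running sum of squared deviations
    for val, lab in zip(values, labels):
        if lab < 0 or (not skipna and np.isnan(m2[lab])):
            continue  # unassigned row, or group already NA
        if not np.isnan(val):
            nobs[lab] += 1
            oldmean = mean[lab]
            mean[lab] += (val - oldmean) / nobs[lab]
            m2[lab] += (val - mean[lab]) * (val - oldmean)
        elif not skipna:
            nobs[lab] = 0
            m2[lab] = np.nan  # poison the group
    out = np.full(ngroups, np.nan)
    ok = nobs > ddof
    out[ok] = m2[ok] / (nobs[ok] - ddof)
    return out
```

Resetting ``nobs`` to zero when poisoning means the observation-count check at the end also yields NA, matching the ``nobs[lab, j] = 0`` line in the Cython diff.
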
@@ -1164,7 +1200,7 @@ def group_mean(
mean_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

assert min_count == -1, "'min_count' only used in sum and prod"

@@ -1194,25 +1230,24 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_mean, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = sumx[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if not skipna and isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
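
The comment about NPY_NAT is worth making concrete: NaT is stored as the minimum int64, and ordinary arithmetic happily walks away from that sentinel, which is why the code checks ``isna_result`` explicitly instead of trusting the accumulator:

```python
import numpy as np

NPY_NAT = np.iinfo(np.int64).min  # the int64 sentinel behind NaT
print(NPY_NAT)      # -9223372036854775808
print(NPY_NAT + 5)  # -9223372036854775803 -- no longer the sentinel
```
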
@@ -1806,6 +1841,7 @@ cdef group_min_max(
bint compute_max=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
):
"""
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1833,6 +1869,8 @@
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
skipna : bool, default True
If True, ignore nans in `values`.

Notes
-----
@@ -1841,17 +1879,18 @@
"""
cdef:
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
numeric_t val
numeric_t val, nan_val
numeric_t[:, ::1] group_min_or_max
int64_t[:, ::1] nobs
bint uses_mask = mask is not None
bint isna_entry
bint isna_entry, isna_result

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
nan_val = _get_na_val(<numeric_t>0, is_datetimelike)

group_min_or_max = np.empty_like(out)
group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
@@ -1870,8 +1909,15 @@

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(group_min_or_max[lab, j],
is_datetimelike)

if not skipna and isna_result:
# If current min/max is already NA, it will always be NA
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1881,6 +1927,11 @@
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
group_min_or_max[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
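
This is the same poisoning pattern as ``group_prod``, specialized to a running extremum: once a group's min/max is NA it can never recover, so later rows are skipped outright. A sketch for the ``compute_max=True`` case, dropping the mask and ``min_count`` details; names are mine:

```python
import numpy as np

def grouped_max_sketch(values, labels, ngroups, skipna=True):
    # Illustrative analogue of group_min_max with compute_max=True.
    out = np.full(ngroups, -np.inf)  # identity element for max
    seen = np.zeros(ngroups, dtype=bool)
    for val, lab in zip(values, labels):
        if lab < 0:
            continue
        if not skipna and np.isnan(out[lab]):
            continue  # running max is already NA and will stay NA
        if not np.isnan(val):
            seen[lab] = True
            out[lab] = max(out[lab], val)
        elif not skipna:
            out[lab] = np.nan  # an NA poisons the running max
    out[~seen] = np.nan  # groups with no valid observations are NA
    return out
```
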
@@ -2012,6 +2063,7 @@ def group_max(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
@@ -2024,6 +2076,7 @@
compute_max=True,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


@@ -2038,6 +2091,7 @@ def group_min(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
@@ -2050,6 +2104,7 @@
compute_max=False,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


pandas/core/_numba/kernels/min_max_.py (6 additions, 2 deletions)
@@ -88,6 +88,7 @@ def grouped_min_max(
ngroups: int,
min_periods: int,
is_max: bool,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
@@ -97,13 +98,16 @@
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
continue

if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
else:
# NaN value cannot be a min/max value
if not skipna:
# If skipna is False and we encounter a NaN,
# both min and max of the group will be NaN
output[lab] = np.nan
continue

if nobs[lab] == 1:
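
The numba kernel gets the matching treatment: the new ``nobs[lab] >= 1 and np.isnan(output[lab])`` guard plays the same role as ``isna_result`` in the Cython code, and with ``skipna=False`` the first NaN writes NaN into the output. Assuming numba is installed, the accelerated path should agree with the Cython one; the example below is illustrative, not from the PR:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

# Same skipna semantics on the numba-accelerated engine.
df.groupby("key")["val"].max(engine="numba", skipna=False)
# key
# a    NaN
# b    3.0
```
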