Skip to content

ENH: enable skipna on groupby reduction ops #43671

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ Other enhancements
- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
- :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
- :meth:`.GroupBy.cummin`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.sum`, and :meth:`.GroupBy.mean` now support the argument ``skipna`` (:issue:`34047`)
- :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`)
- :meth:`DataFrame.to_stata` and :meth:`StataWriter` now accept the keyword-only argument ``value_labels`` to save labels for non-categorical columns
- Methods that relied on hashmap-based algorithms such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored the imaginary component of complex numbers (:issue:`17927`)
Expand Down
26 changes: 24 additions & 2 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,8 @@ def group_add(add_t[:, ::1] out,
int64_t[::1] counts,
ndarray[add_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=0) -> None:
Py_ssize_t min_count=0,
bint skipna=True) -> None:
"""
Only aggregates on axis=0 using Kahan summation
"""
Expand Down Expand Up @@ -538,6 +539,8 @@ def group_add(add_t[:, ::1] out,
else:
t = sumx[lab, j] + val
sumx[lab, j] = t
elif not skipna:
sumx[lab, j] += val

for i in range(ncounts):
for j in range(K):
Expand All @@ -563,6 +566,10 @@ def group_add(add_t[:, ::1] out,
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
sumx[lab, j] = t
# don't skip nan
elif not skipna:
sumx[lab, j] = NAN
break

for i in range(ncounts):
for j in range(K):
Expand All @@ -578,7 +585,8 @@ def group_prod(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=0) -> None:
Py_ssize_t min_count=0,
bint skipna=True) -> None:
"""
Only aggregates on axis=0
"""
Expand Down Expand Up @@ -611,6 +619,10 @@ def group_prod(floating[:, ::1] out,
if val == val:
nobs[lab, j] += 1
prodx[lab, j] *= val
# don't skip nan
elif not skipna:
prodx[lab, j] = NAN
break

for i in range(ncounts):
for j in range(K):
Expand All @@ -628,6 +640,7 @@ def group_var(floating[:, ::1] out,
ndarray[floating, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint skipna=True,
int64_t ddof=1) -> None:
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
Expand Down Expand Up @@ -681,6 +694,10 @@ def group_mean(mean_t[:, ::1] out,
int64_t[::1] counts,
ndarray[mean_t, ndim=2] values,
const intp_t[::1] labels,
<<<<<<< HEAD
bint skipna=True,
Py_ssize_t min_count=-1) -> None:
=======
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
Expand Down Expand Up @@ -717,6 +734,7 @@ def group_mean(mean_t[:, ::1] out,
`counts` is modified to hold group sizes
"""

>>>>>>> 2e29e1172bb5d17c5d6f4d8bec1d3e6452091822
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
mean_t val, count, y, t, nan_val
Expand Down Expand Up @@ -753,6 +771,10 @@ def group_mean(mean_t[:, ::1] out,
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
sumx[lab, j] = t
# don't skip nan
elif not skipna:
sumx[lab, j] = NAN
break

for i in range(ncounts):
for j in range(K):
Expand Down
50 changes: 50 additions & 0 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,56 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
output = self._reindex_output(output)
return output

<<<<<<< HEAD
def _cython_agg_general(
    self,
    how: str,
    alt: Callable,
    numeric_only: bool,
    min_count: int = -1,
    skipna: bool = True,
):
    """
    Aggregate the selected object through the grouper's cython operation.

    Parameters
    ----------
    how : str
        Name of the aggregation, forwarded to ``grouper._cython_operation``.
    alt : Callable
        Function handed to ``_agg_py_fallback`` when the cython operation
        raises ``NotImplementedError``.
    numeric_only : bool
        If True and the selected object's dtype is not numeric, raise.
    min_count : int, default -1
        Forwarded to the cython operation only.
    skipna : bool, default True
        Forwarded to the cython operation only.

    Returns
    -------
    The result of ``self._reindex_output`` applied to an object built with
    ``self.obj._constructor``, indexed by ``self.grouper.result_index``.

    Raises
    ------
    NotImplementedError
        If ``numeric_only`` is True and the selected dtype is not numeric.

    Notes
    -----
    The python fallback receives neither ``min_count`` nor ``skipna``.
    """
    selected = self._selected_obj
    raw_values = selected._values
    mgr = selected._mgr

    if numeric_only and not is_numeric_dtype(selected.dtype):
        # GH#41291 match Series behavior
        raise NotImplementedError(
            f"{type(self).__name__}.{how} does not implement numeric_only."
        )

    # DataFrameGroupBy._cython_agg_general wraps this step in an ``array_func``
    # helper; here there is only one array to aggregate, so it is done inline.
    try:
        aggregated = self.grouper._cython_operation(
            "aggregate",
            raw_values,
            how,
            axis=mgr.ndim - 1,
            min_count=min_count,
            skipna=skipna,
        )
    except NotImplementedError:
        # generally if we have numeric_only=False
        # and non-applicable functions
        # try to python agg
        # TODO: shouldn't min_count matter?
        aggregated = self._agg_py_fallback(raw_values, ndim=mgr.ndim, alt=alt)

    out = self.obj._constructor(
        aggregated, index=self.grouper.result_index, name=selected.name
    )
    return self._reindex_output(out)

=======
>>>>>>> 2e29e1172bb5d17c5d6f4d8bec1d3e6452091822
def _indexed_output_to_ndframe(
self, output: Mapping[base.OutputKey, ArrayLike]
) -> Series:
Expand Down
26 changes: 22 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1452,6 +1452,7 @@ def _agg_general(
*,
alias: str,
npfunc: Callable,
skipna=True,
):

with self._group_selection_context():
Expand All @@ -1461,6 +1462,7 @@ def _agg_general(
alt=npfunc,
numeric_only=numeric_only,
min_count=min_count,
skipna=skipna,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -1506,7 +1508,12 @@ def _agg_py_fallback(

@final
def _cython_agg_general(
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
self,
how: str,
alt: Callable,
numeric_only: bool,
min_count: int = -1,
skipna: bool = False,
):
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
# that goes through SeriesGroupBy
Expand Down Expand Up @@ -2028,7 +2035,10 @@ def size(self) -> DataFrame | Series:
@final
@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
def sum(
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
self,
numeric_only: bool | lib.NoDefault = lib.no_default,
min_count: int = 0,
skipna: bool = True,
):
numeric_only = self._resolve_numeric_only(numeric_only)

Expand All @@ -2041,19 +2051,27 @@ def sum(
min_count=min_count,
alias="add",
npfunc=np.sum,
skipna=skipna,
)

return self._reindex_output(result, fill_value=0)

@final
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
def prod(
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
self,
numeric_only: bool | lib.NoDefault = lib.no_default,
min_count: int = 0,
skipna: bool = True,
):
numeric_only = self._resolve_numeric_only(numeric_only)

return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
numeric_only=numeric_only,
min_count=min_count,
alias="prod",
npfunc=np.prod,
skipna=skipna,
)

@final
Expand Down
30 changes: 27 additions & 3 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ def _ea_wrap_cython_operation(
min_count: int,
ngroups: int,
comp_ids: np.ndarray,
skipna: bool = True,
**kwargs,
) -> ArrayLike:
"""
Expand All @@ -336,6 +337,7 @@ def _ea_wrap_cython_operation(
min_count=min_count,
ngroups=ngroups,
comp_ids=comp_ids,
skipna=skipna,
**kwargs,
)

Expand Down Expand Up @@ -363,6 +365,7 @@ def _ea_wrap_cython_operation(
ngroups=ngroups,
comp_ids=comp_ids,
mask=None,
skipna=skipna,
**kwargs,
)

Expand Down Expand Up @@ -399,6 +402,7 @@ def _masked_ea_wrap_cython_operation(
min_count: int,
ngroups: int,
comp_ids: np.ndarray,
skipna: bool = True,
**kwargs,
) -> BaseMaskedArray:
"""
Expand All @@ -419,6 +423,7 @@ def _masked_ea_wrap_cython_operation(
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
skipna=skipna,
**kwargs,
)

Expand All @@ -441,6 +446,7 @@ def _cython_op_ndim_compat(
comp_ids: np.ndarray,
mask: np.ndarray | None = None,
result_mask: np.ndarray | None = None,
skipna: bool = True,
**kwargs,
) -> np.ndarray:
if values.ndim == 1:
Expand All @@ -457,6 +463,7 @@ def _cython_op_ndim_compat(
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
skipna=skipna,
**kwargs,
)
if res.shape[0] == 1:
Expand All @@ -472,6 +479,7 @@ def _cython_op_ndim_compat(
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
skipna=skipna,
**kwargs,
)

Expand All @@ -485,6 +493,7 @@ def _call_cython_op(
comp_ids: np.ndarray,
mask: np.ndarray | None,
result_mask: np.ndarray | None,
skipna: bool = True,
**kwargs,
) -> np.ndarray: # np.ndarray[ndim=2]
orig_values = values
Expand Down Expand Up @@ -530,9 +539,10 @@ def _call_cython_op(
mask=mask,
result_mask=result_mask,
is_datetimelike=is_datetimelike,
skipna=skipna,
)
else:
func(result, counts, values, comp_ids, min_count)
func(result, counts, values, comp_ids, min_count, skipna=skipna)
else:
# TODO: min_count
if self.uses_mask():
Expand All @@ -543,10 +553,19 @@ def _call_cython_op(
ngroups,
is_datetimelike,
mask=mask,
skipna=skipna,
**kwargs,
)
else:
func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
func(
result,
values,
comp_ids,
ngroups,
is_datetimelike,
skipna=skipna,
**kwargs,
)

if self.kind == "aggregate":
# i.e. counts is defined. Locations where count<min_count
Expand Down Expand Up @@ -580,9 +599,10 @@ def cython_operation(
*,
values: ArrayLike,
axis: int,
min_count: int = -1,
comp_ids: np.ndarray,
ngroups: int,
min_count: int = -1,
skipna: bool = True,
**kwargs,
) -> ArrayLike:
"""
Expand Down Expand Up @@ -611,6 +631,7 @@ def cython_operation(
min_count=min_count,
ngroups=ngroups,
comp_ids=comp_ids,
skipna=skipna,
**kwargs,
)

Expand All @@ -620,6 +641,7 @@ def cython_operation(
ngroups=ngroups,
comp_ids=comp_ids,
mask=None,
skipna=skipna,
**kwargs,
)

Expand Down Expand Up @@ -914,6 +936,7 @@ def _cython_operation(
how: str,
axis: int,
min_count: int = -1,
skipna: bool = True,
**kwargs,
) -> ArrayLike:
"""
Expand All @@ -931,6 +954,7 @@ def _cython_operation(
min_count=min_count,
comp_ids=ids,
ngroups=ngroups,
skipna=skipna,
**kwargs,
)

Expand Down