pandas-dev · geoffrey-eisenbarth · Sep 16, 2021 · Sep 20, 2021 · Sep 20, 2021 · Sep 27, 2021
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -168,7 +168,7 @@ Other enhancements
 - Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
 - :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
 - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
-- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
+- :meth:`.GroupBy.cummin`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.sum`, :meth:`.GroupBy.prod`, and :meth:`.GroupBy.mean` now support the argument ``skipna`` (:issue:`34047`)
 - :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`)
 - :meth:`DataFrame.to_stata` and :meth:`StataWriter` now accept the keyword only argument ``value_labels`` to save labels for non-categorical columns
 - Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -495,7 +495,8 @@ def group_add(add_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[add_t, ndim=2] values,
               const intp_t[::1] labels,
-              Py_ssize_t min_count=0) -> None:
+              Py_ssize_t min_count=0,
+              bint skipna=True) -> None:
     """
     Only aggregates on axis=0 using Kahan summation
     """
@@ -538,6 +539,8 @@ def group_add(add_t[:, ::1] out,
                     else:
                         t = sumx[lab, j] + val
                     sumx[lab, j] = t
+                elif not skipna:
+                    sumx[lab, j] += val
 
         for i in range(ncounts):
             for j in range(K):
@@ -563,6 +566,10 @@ def group_add(add_t[:, ::1] out,
                         t = sumx[lab, j] + y
                         compensation[lab, j] = t - sumx[lab, j] - y
                         sumx[lab, j] = t
+                    # don't skip nan: poison only this column's sum and
+                    # keep iterating so the row's other columns are summed
+                    elif not skipna:
+                        sumx[lab, j] = NAN
 
             for i in range(ncounts):
                 for j in range(K):
@@ -578,7 +585,8 @@ def group_prod(floating[:, ::1] out,
                int64_t[::1] counts,
                ndarray[floating, ndim=2] values,
                const intp_t[::1] labels,
-               Py_ssize_t min_count=0) -> None:
+               Py_ssize_t min_count=0,
+               bint skipna=True) -> None:
     """
     Only aggregates on axis=0
     """
@@ -611,6 +619,10 @@ def group_prod(floating[:, ::1] out,
                 if val == val:
                     nobs[lab, j] += 1
                     prodx[lab, j] *= val
+                # don't skip nan: poison only this column's product and
+                # keep iterating so the row's other columns are multiplied
+                elif not skipna:
+                    prodx[lab, j] = NAN
 
         for i in range(ncounts):
             for j in range(K):
@@ -628,6 +640,7 @@ def group_var(floating[:, ::1] out,
               ndarray[floating, ndim=2] values,
               const intp_t[::1] labels,
               Py_ssize_t min_count=-1,
+              bint skipna=True,
               int64_t ddof=1) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -681,6 +694,10 @@ def group_mean(mean_t[:, ::1] out,
                int64_t[::1] counts,
                ndarray[mean_t, ndim=2] values,
                const intp_t[::1] labels,
+<<<<<<< HEAD
+               bint skipna=True,
+               Py_ssize_t min_count=-1) -> None:
+=======
                Py_ssize_t min_count=-1,
                bint is_datetimelike=False,
                const uint8_t[:, ::1] mask=None,
@@ -717,6 +734,7 @@ def group_mean(mean_t[:, ::1] out,
     `counts` is modified to hold group sizes
     """
 
+>>>>>>> 2e29e1172bb5d17c5d6f4d8bec1d3e6452091822
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         mean_t val, count, y, t, nan_val
@@ -753,6 +771,10 @@ def group_mean(mean_t[:, ::1] out,
                     t = sumx[lab, j] + y
                     compensation[lab, j] = t - sumx[lab, j] - y
                     sumx[lab, j] = t
+                # don't skip nan: poison only this column's sum and
+                # keep iterating so the row's other columns are averaged
+                elif not skipna:
+                    sumx[lab, j] = NAN
 
         for i in range(ncounts):
             for j in range(K):

@@ -337,6 +337,56 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
         output = self._reindex_output(output)
         return output
 
+<<<<<<< HEAD
+    def _cython_agg_general(
+        self,
+        how: str,
+        alt: Callable,
+        numeric_only: bool,
+        min_count: int = -1,
+        skipna: bool = True,
+    ):
+
+        obj = self._selected_obj
+        objvals = obj._values
+        data = obj._mgr
+
+        if numeric_only and not is_numeric_dtype(obj.dtype):
+            # GH#41291 match Series behavior
+            raise NotImplementedError(
+                f"{type(self).__name__}.{how} does not implement numeric_only."
+            )
+
+        # This is overkill because it is only called once, but is here to
+        #  mirror the array_func used in DataFrameGroupBy._cython_agg_general
+        def array_func(values: ArrayLike) -> ArrayLike:
+            try:
+                result = self.grouper._cython_operation(
+                    "aggregate",
+                    values,
+                    how,
+                    axis=data.ndim - 1,
+                    min_count=min_count,
+                    skipna=skipna,
+                )
+            except NotImplementedError:
+                # generally if we have numeric_only=False
+                # and non-applicable functions
+                # try to python agg
+                # TODO: shouldn't min_count matter?
+                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
+
+            return result
+
+        result = array_func(objvals)
+
+        ser = self.obj._constructor(
+            result, index=self.grouper.result_index, name=obj.name
+        )
+        return self._reindex_output(ser)
+
+=======
+>>>>>>> 2e29e1172bb5d17c5d6f4d8bec1d3e6452091822
     def _indexed_output_to_ndframe(
         self, output: Mapping[base.OutputKey, ArrayLike]
     ) -> Series:

@@ -1452,6 +1452,7 @@ def _agg_general(
         *,
         alias: str,
         npfunc: Callable,
+        skipna: bool = True,
     ):
 
         with self._group_selection_context():
@@ -1461,6 +1462,7 @@ def _agg_general(
                 alt=npfunc,
                 numeric_only=numeric_only,
                 min_count=min_count,
+                skipna=skipna,
             )
             return result.__finalize__(self.obj, method="groupby")
 
@@ -1506,7 +1508,12 @@ def _agg_py_fallback(
 
     @final
     def _cython_agg_general(
-        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
+        self,
+        how: str,
+        alt: Callable,
+        numeric_only: bool,
+        min_count: int = -1,
+        skipna: bool = True,
     ):
         # Note: we never get here with how="ohlc" for DataFrameGroupBy;
         #  that goes through SeriesGroupBy
@@ -2028,7 +2035,10 @@ def size(self) -> DataFrame | Series:
     @final
     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
     def sum(
-        self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
+        self,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+        min_count: int = 0,
+        skipna: bool = True,
     ):
         numeric_only = self._resolve_numeric_only(numeric_only)
 
@@ -2041,19 +2051,27 @@ def sum(
                 min_count=min_count,
                 alias="add",
                 npfunc=np.sum,
+                skipna=skipna,
             )
 
         return self._reindex_output(result, fill_value=0)
 
     @final
     @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
     def prod(
-        self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
+        self,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+        min_count: int = 0,
+        skipna: bool = True,
     ):
         numeric_only = self._resolve_numeric_only(numeric_only)
 
         return self._agg_general(
-            numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
+            numeric_only=numeric_only,
+            min_count=min_count,
+            alias="prod",
+            npfunc=np.prod,
+            skipna=skipna,
         )
 
     @final

@@ -323,6 +323,7 @@ def _ea_wrap_cython_operation(
         min_count: int,
         ngroups: int,
         comp_ids: np.ndarray,
+        skipna: bool = True,
         **kwargs,
     ) -> ArrayLike:
         """
@@ -336,6 +337,7 @@ def _ea_wrap_cython_operation(
                 min_count=min_count,
                 ngroups=ngroups,
                 comp_ids=comp_ids,
+                skipna=skipna,
                 **kwargs,
             )
 
@@ -363,6 +365,7 @@ def _ea_wrap_cython_operation(
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=None,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -399,6 +402,7 @@ def _masked_ea_wrap_cython_operation(
         min_count: int,
         ngroups: int,
         comp_ids: np.ndarray,
+        skipna: bool = True,
         **kwargs,
     ) -> BaseMaskedArray:
         """
@@ -419,6 +423,7 @@ def _masked_ea_wrap_cython_operation(
             comp_ids=comp_ids,
             mask=mask,
             result_mask=result_mask,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -441,6 +446,7 @@ def _cython_op_ndim_compat(
         comp_ids: np.ndarray,
         mask: np.ndarray | None = None,
         result_mask: np.ndarray | None = None,
+        skipna: bool = True,
         **kwargs,
     ) -> np.ndarray:
         if values.ndim == 1:
@@ -457,6 +463,7 @@ def _cython_op_ndim_compat(
                 comp_ids=comp_ids,
                 mask=mask,
                 result_mask=result_mask,
+                skipna=skipna,
                 **kwargs,
             )
             if res.shape[0] == 1:
@@ -472,6 +479,7 @@ def _cython_op_ndim_compat(
             comp_ids=comp_ids,
             mask=mask,
             result_mask=result_mask,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -485,6 +493,7 @@ def _call_cython_op(
         comp_ids: np.ndarray,
         mask: np.ndarray | None,
         result_mask: np.ndarray | None,
+        skipna: bool = True,
         **kwargs,
     ) -> np.ndarray:  # np.ndarray[ndim=2]
         orig_values = values
@@ -530,9 +539,10 @@ def _call_cython_op(
                     mask=mask,
                     result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
+                    skipna=skipna,
                 )
             else:
-                func(result, counts, values, comp_ids, min_count)
+                func(result, counts, values, comp_ids, min_count, skipna=skipna)
         else:
             # TODO: min_count
             if self.uses_mask():
@@ -543,10 +553,19 @@ def _call_cython_op(
                     ngroups,
                     is_datetimelike,
                     mask=mask,
+                    skipna=skipna,
                     **kwargs,
                 )
             else:
-                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
+                func(
+                    result,
+                    values,
+                    comp_ids,
+                    ngroups,
+                    is_datetimelike,
+                    skipna=skipna,
+                    **kwargs,
+                )
 
         if self.kind == "aggregate":
             # i.e. counts is defined.  Locations where count<min_count
@@ -580,9 +599,10 @@ def cython_operation(
         *,
         values: ArrayLike,
         axis: int,
-        min_count: int = -1,
         comp_ids: np.ndarray,
         ngroups: int,
+        min_count: int = -1,
+        skipna: bool = True,
         **kwargs,
     ) -> ArrayLike:
         """
@@ -611,6 +631,7 @@ def cython_operation(
                 min_count=min_count,
                 ngroups=ngroups,
                 comp_ids=comp_ids,
+                skipna=skipna,
                 **kwargs,
             )
 
@@ -620,6 +641,7 @@ def cython_operation(
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=None,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -914,6 +936,7 @@ def _cython_operation(
         how: str,
         axis: int,
         min_count: int = -1,
+        skipna: bool = True,
         **kwargs,
     ) -> ArrayLike:
         """
@@ -931,6 +954,7 @@ def _cython_operation(
             min_count=min_count,
             comp_ids=ids,
             ngroups=ngroups,
+            skipna=skipna,
             **kwargs,
         )