pandas-dev · geoffrey-eisenbarth · Sep 16, 2021 · Sep 20, 2021 · Sep 20, 2021 · Sep 27, 2021
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -117,7 +117,7 @@ Other enhancements
 - Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
 - :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
 - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
-- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
+- :meth:`.GroupBy.cummin`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.sum`, :meth:`.GroupBy.prod`, and :meth:`.GroupBy.mean` now support the argument ``skipna`` (:issue:`34047`)
 - :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`)
 - :meth:`DataFrame.to_stata` and :meth:`StataWriter` now accept the keyword only argument ``value_labels`` to save labels for non-categorical columns
 - Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -487,7 +487,8 @@ def group_add(add_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[add_t, ndim=2] values,
               const intp_t[::1] labels,
-              Py_ssize_t min_count=0) -> None:
+              Py_ssize_t min_count=0,
+              bint skipna=True) -> None:
     """
     Only aggregates on axis=0 using Kahan summation
     """
@@ -530,6 +531,9 @@ def group_add(add_t[:, ::1] out,
                     else:
                         t = sumx[lab, j] + val
                     sumx[lab, j] = t
+                elif skipna == False:
+                    # NOTE: Does this case need to be considered?
+                    pass
 
         for i in range(ncounts):
             for j in range(K):
@@ -555,6 +559,10 @@ def group_add(add_t[:, ::1] out,
                         t = sumx[lab, j] + y
                         compensation[lab, j] = t - sumx[lab, j] - y
                         sumx[lab, j] = t
+                    # don't skip nan
+                    elif not skipna:
+                        sumx[lab, j] = NAN
+                        break
 
             for i in range(ncounts):
                 for j in range(K):
@@ -570,7 +578,8 @@ def group_prod(floating[:, ::1] out,
                int64_t[::1] counts,
                ndarray[floating, ndim=2] values,
                const intp_t[::1] labels,
-               Py_ssize_t min_count=0) -> None:
+               Py_ssize_t min_count=0,
+               bint skipna=True) -> None:
     """
     Only aggregates on axis=0
     """
@@ -603,6 +612,10 @@ def group_prod(floating[:, ::1] out,
                 if val == val:
                     nobs[lab, j] += 1
                     prodx[lab, j] *= val
+                # don't skip nan
+                elif not skipna:
+                    prodx[lab, j] = NAN
+                    break
 
         for i in range(ncounts):
             for j in range(K):
@@ -620,6 +633,7 @@ def group_var(floating[:, ::1] out,
               ndarray[floating, ndim=2] values,
               const intp_t[::1] labels,
               Py_ssize_t min_count=-1,
+              bint skipna=True,
               int64_t ddof=1) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -709,6 +723,11 @@ def group_mean(floating[:, ::1] out,
                     t = sumx[lab, j] + y
                     compensation[lab, j] = t - sumx[lab, j] - y
                     sumx[lab, j] = t
+                # don't skip nan
+                elif skipna == False:
+                    # NOTE: Unsure about this, should this loop break here?
+                    sumx[lab, j] = NAN
+                    break
 
         for i in range(ncounts):
             for j in range(K):

@@ -329,7 +329,12 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
         return output
 
     def _cython_agg_general(
-        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
+        self,
+        how: str,
+        alt: Callable,
+        numeric_only: bool,
+        min_count: int = -1,
+        skipna: bool = True,
     ):
 
         obj = self._selected_obj
@@ -347,7 +352,12 @@ def _cython_agg_general(
         def array_func(values: ArrayLike) -> ArrayLike:
             try:
                 result = self.grouper._cython_operation(
-                    "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
+                    "aggregate",
+                    values,
+                    how,
+                    axis=data.ndim - 1,
+                    min_count=min_count,
+                    skipna=skipna,
                 )
             except NotImplementedError:
                 # generally if we have numeric_only=False

@@ -1448,6 +1448,7 @@ def _agg_general(
         *,
         alias: str,
         npfunc: Callable,
+        skipna=True,
     ):
 
         with group_selection_context(self):
@@ -1457,6 +1458,7 @@ def _agg_general(
                 alt=npfunc,
                 numeric_only=numeric_only,
                 min_count=min_count,
+                skipna=skipna,
             )
             return result.__finalize__(self.obj, method="groupby")
 
@@ -1501,7 +1503,12 @@ def _agg_py_fallback(
         return ensure_block_shape(res_values, ndim=ndim)
 
     def _cython_agg_general(
-        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
+        self,
+        how: str,
+        alt: Callable,
+        numeric_only: bool,
+        min_count: int = -1,
+        skipna: bool = True,  # same default as the concrete implementation
     ):
         raise AbstractMethodError(self)
 
@@ -1967,7 +1974,10 @@ def size(self) -> DataFrame | Series:
     @final
     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
     def sum(
-        self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
+        self,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+        min_count: int = 0,
+        skipna: bool = True,
     ):
         numeric_only = self._resolve_numeric_only(numeric_only)
 
@@ -1980,19 +1990,27 @@ def sum(
                 min_count=min_count,
                 alias="add",
                 npfunc=np.sum,
+                skipna=skipna,
             )
 
         return self._reindex_output(result, fill_value=0)
 
     @final
     @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
     def prod(
-        self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
+        self,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+        min_count: int = 0,
+        skipna: bool = True,
     ):
         numeric_only = self._resolve_numeric_only(numeric_only)
 
         return self._agg_general(
-            numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
+            numeric_only=numeric_only,
+            min_count=min_count,
+            alias="prod",
+            npfunc=np.prod,
+            skipna=skipna,
         )
 
     @final

@@ -322,6 +322,7 @@ def _ea_wrap_cython_operation(
         min_count: int,
         ngroups: int,
         comp_ids: np.ndarray,
+        skipna: bool = True,
         **kwargs,
     ) -> ArrayLike:
         """
@@ -335,6 +336,7 @@ def _ea_wrap_cython_operation(
                 min_count=min_count,
                 ngroups=ngroups,
                 comp_ids=comp_ids,
+                skipna=skipna,
                 **kwargs,
             )
 
@@ -359,6 +361,7 @@ def _ea_wrap_cython_operation(
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=None,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -393,6 +396,7 @@ def _masked_ea_wrap_cython_operation(
         min_count: int,
         ngroups: int,
         comp_ids: np.ndarray,
+        skipna: bool = True,
         **kwargs,
     ) -> BaseMaskedArray:
         """
@@ -413,6 +417,7 @@ def _masked_ea_wrap_cython_operation(
             comp_ids=comp_ids,
             mask=mask,
             result_mask=result_mask,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -435,6 +440,7 @@ def _cython_op_ndim_compat(
         comp_ids: np.ndarray,
         mask: np.ndarray | None = None,
         result_mask: np.ndarray | None = None,
+        skipna: bool = True,
         **kwargs,
     ) -> np.ndarray:
         if values.ndim == 1:
@@ -451,6 +457,7 @@ def _cython_op_ndim_compat(
                 comp_ids=comp_ids,
                 mask=mask,
                 result_mask=result_mask,
+                skipna=skipna,
                 **kwargs,
             )
             if res.shape[0] == 1:
@@ -466,6 +473,7 @@ def _cython_op_ndim_compat(
             comp_ids=comp_ids,
             mask=mask,
             result_mask=result_mask,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -479,6 +487,7 @@ def _call_cython_op(
         comp_ids: np.ndarray,
         mask: np.ndarray | None,
         result_mask: np.ndarray | None,
+        skipna: bool = True,
         **kwargs,
     ) -> np.ndarray:  # np.ndarray[ndim=2]
         orig_values = values
@@ -524,9 +533,10 @@ def _call_cython_op(
                     mask=mask,
                     result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
+                    skipna=skipna,
                 )
             else:
-                func(result, counts, values, comp_ids, min_count)
+                func(result, counts, values, comp_ids, min_count, skipna=skipna)
         else:
             # TODO: min_count
             if self.uses_mask():
@@ -537,10 +547,19 @@ def _call_cython_op(
                     ngroups,
                     is_datetimelike,
                     mask=mask,
+                    skipna=skipna,
                     **kwargs,
                 )
             else:
-                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
+                func(
+                    result,
+                    values,
+                    comp_ids,
+                    ngroups,
+                    is_datetimelike,
+                    skipna=skipna,
+                    **kwargs,
+                )
 
         if self.kind == "aggregate":
             # i.e. counts is defined.  Locations where count<min_count
@@ -574,9 +593,10 @@ def cython_operation(
         *,
         values: ArrayLike,
         axis: int,
-        min_count: int = -1,
         comp_ids: np.ndarray,
         ngroups: int,
+        min_count: int = -1,
+        skipna: bool = True,
         **kwargs,
     ) -> ArrayLike:
         """
@@ -605,6 +625,7 @@ def cython_operation(
                 min_count=min_count,
                 ngroups=ngroups,
                 comp_ids=comp_ids,
+                skipna=skipna,
                 **kwargs,
             )
 
@@ -614,6 +635,7 @@ def cython_operation(
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=None,
+            skipna=skipna,
             **kwargs,
         )
 
@@ -890,6 +912,7 @@ def _cython_operation(
         how: str,
         axis: int,
         min_count: int = -1,
+        skipna: bool = True,
         **kwargs,
     ) -> ArrayLike:
         """
@@ -907,6 +930,7 @@ def _cython_operation(
             min_count=min_count,
             comp_ids=ids,
             ngroups=ngroups,
+            skipna=skipna,
             **kwargs,
         )