Skip to content

BUG/ENH: group cummin/max handle skipna #41854

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 31, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ Other enhancements
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
- :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
Expand Down
135 changes: 98 additions & 37 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint skipna,
bint compute_max):
"""
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
Expand All @@ -1336,6 +1337,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
Number of groups, larger than all entries of `labels`.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool
If True, ignore nans in `values`.

compute_max : bool
True if cumulative maximum should be computed, False
if cumulative minimum should be computed
Expand All @@ -1345,61 +1349,114 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
This method modifies the `out` parameter, rather than returning an object.
"""
cdef:
Py_ssize_t i, j, N, K, size
groupby_t val, mval
groupby_t[:, ::1] accum
intp_t lab
bint val_is_nan, use_mask

use_mask = mask is not None

N, K = (<object>values).shape
accum = np.empty((ngroups, K), dtype=values.dtype)
accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
if groupby_t is int64_t:
accum[:] = -_int64_max if compute_max else _int64_max
elif groupby_t is uint64_t:
accum[:] = 0 if compute_max else np.iinfo(np.uint64).max
else:
accum[:] = -np.inf if compute_max else np.inf

if mask is not None:
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max)
else:
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max)


@cython.boundscheck(False)
@cython.wraparound(False)
cdef cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint is_datetimelike,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
`labels`.
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval, na_val
uint8_t[::1] seen_na
intp_t lab

if groupby_t is float64_t or groupby_t is float32_t:
na_val = NaN
elif is_datetimelike:
na_val = NPY_NAT

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we avoid allocating in some cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have changed to only allocate if a missing value is possible. Could avoid more aggressively perhaps (eg only as soon as a missing value is seen), see comment for the masked case

with nogil:
for i in range(N):
lab = labels[i]

if lab < 0:
continue
for j in range(K):
val_is_nan = False

if use_mask:
if mask[i, j]:

# `out` does not need to be set since it
# will be masked anyway
val_is_nan = True
else:

# If using the mask, we can avoid grabbing the
# value unless necessary
val = values[i, j]

# Otherwise, `out` must be set accordingly if the
# value is missing
if not skipna and seen_na[lab]:
out[i, j] = na_val
else:
val = values[i, j]
if _treat_as_na(val, is_datetimelike):
val_is_nan = True
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
seen_na[lab] = 1
out[i, j] = val

if not val_is_nan:
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val

@cython.boundscheck(False)
@cython.wraparound(False)
cdef masked_cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
`labels` with a masked algorithm.
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
uint8_t[::1] seen_na
intp_t lab

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we avoid initializing this in some cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure here, since we can't tell whether a missing value is present without looking through the whole array. I suppose we could allocate only as soon as a missing value is seen, but that would add some complexity since the algo is wrapped in a nogil block.

Hopefully this is not a big memory addition, since it is ngroups * ncols bytes vs. the nrow * ncols * 4 for the values (since groupby_t only covers int64/float64/float32).

with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue
for j in range(K):
if not skipna and seen_na[lab]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticing duplication between this and the non-masked version above. In a recent PR to add masked support for min/max, I found that combining the two functions (and so doing a check for mask is None inside the loop) didn't make a noticeable perf difference.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed about the duplication not being great - split these in #41853 because when I originally tried to make this change with one combined algo, the logic ended up getting pretty convoluted, so I thought the tradeoff was worth it (but agreed about perf, saw no perf improvement from avoiding the mask is None check in #41853)

I'd hope that at some point, when more masked algos are supported, it'll make sense to keep only the masked kernels (and compute the mask outside, passing it in regardless of type). This would help avoid some of the ugliness around determining nulls with stuff like datetimelike.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, better to de-duplicate in a dedicated pass anyway

mask[i, j] = 1
else:
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
seen_na[lab] = 1


@cython.boundscheck(False)
Expand All @@ -1409,7 +1466,8 @@ def group_cummin(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1418,6 +1476,7 @@ def group_cummin(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=False
)

Expand All @@ -1429,7 +1488,8 @@ def group_cummax(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1438,5 +1498,6 @@ def group_cummax(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=True
)
6 changes: 4 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,10 +2737,11 @@ def cummin(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this go in the signature explicitly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some context in #41854 (comment).

Happy to make that change here; I was planning on doing it in a follow-up to make skipna and numeric_only consistent for cumsum and cummin/cummax.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree we want this in the signature; doing it as a follow-up is OK.

if axis != 0:
return self.apply(lambda x: np.minimum.accumulate(x, axis))

return self._cython_transform("cummin", numeric_only=False)
return self._cython_transform("cummin", numeric_only=False, skipna=skipna)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally cummin/max could handle kwargs the same way as cumsum does, but the problem is the forced numeric_only=False.

As a followup change, it would be nice to remove this inconsistency (especially since numeric_only=True would just be ignored). Would this require a deprecation cycle, or could current behavior of ignoring numeric_only=True be considered a bug?

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or could current behavior of ignoring numeric_only=True be considered a bug?

I think so, yes.

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Yeah, looks like we currently silently ignore args/kwargs, which isn't great.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, sounds good — I'll plan to leave this as a follow-on, since it should be an improvement independent of these changes.


@final
@Substitution(name="groupby")
Expand All @@ -2753,10 +2754,11 @@ def cummax(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
if axis != 0:
return self.apply(lambda x: np.maximum.accumulate(x, axis))

return self._cython_transform("cummax", numeric_only=False)
return self._cython_transform("cummax", numeric_only=False, skipna=skipna)

@final
def _get_cythonized_result(
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,24 @@ def test_cummax(dtypes_for_minmax):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
"groups,expected_data",
[
([1, 1, 1], [1, None, None]),
([1, 2, 3], [1, None, 2]),
([1, 3, 3], [1, None, None]),
],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
result = getattr(df.groupby(groups)["a"], method)(skipna=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: can you do this on multiple lines, e.g. gb = df.groupby(...)? it makes it more obvious to me what behavior is actually being tested

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, agreed that is clearer

expected = Series(expected_data, dtype=dtype, name="a")
tm.assert_series_equal(result, expected)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
Expand Down