diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ce0158b05c2ab..e74fb5602fc90 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -37,6 +37,7 @@ Other enhancements - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview ` for performance and functional benefits (:issue:`42273`) - Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`) - Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`) +- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 354b87e03e6c4..91921ba0e64c2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, + bint skipna, bint compute_max): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. @@ -1336,6 +1337,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Number of groups, larger than all entries of `labels`. is_datetimelike : bool True if `values` contains datetime-like entries. + skipna : bool + If True, ignore nans in `values`. compute_max : bool True if cumulative maximum should be computed, False if cumulative minimum should be computed @@ -1356,9 +1359,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, accum[:] = -np.inf if compute_max else np.inf if mask is not None: - masked_cummin_max(out, values, mask, labels, accum, compute_max) + masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max) else: - cummin_max(out, values, labels, accum, is_datetimelike, compute_max) + cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max) @cython.boundscheck(False) @@ -1367,6 +1370,7 @@ cdef cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, groupby_t[:, ::1] accum, + bint skipna, bint is_datetimelike, bint compute_max): """ @@ -1375,8 +1379,24 @@ cdef cummin_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K - groupby_t val, mval + groupby_t val, mval, na_val + uint8_t[:, ::1] seen_na intp_t lab + bint na_possible + + if groupby_t is float64_t or groupby_t is float32_t: + na_val = NaN + na_possible = True + elif is_datetimelike: + na_val = NPY_NAT + na_possible = True + # Will never be used, just to avoid uninitialized warning + else: + na_val = 0 + na_possible = False + + if na_possible: + seen_na = np.zeros((accum).shape, dtype=np.uint8) N, K = (values).shape with nogil: @@ -1385,18 +1405,22 @@ cdef cummin_max(groupby_t[:, ::1] out, if lab < 0: continue for j in range(K): - val = values[i, j] - if not _treat_as_na(val, is_datetimelike): - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val - else: - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + if not skipna and na_possible and seen_na[lab, j]: + out[i, j] = na_val else: - out[i, j] = val + val = values[i, j] + if not _treat_as_na(val, is_datetimelike): + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val + else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + else: + seen_na[lab, j] = 1 + out[i, j] = val @cython.boundscheck(False) @@ -1406,6 +1430,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const intp_t[:] labels, groupby_t[:, ::1] accum, + bint skipna, bint compute_max): """ Compute the cumulative minimum/maximum of columns of `values`, in row groups @@ -1414,25 +1439,32 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K groupby_t val, mval + uint8_t[:, ::1] seen_na intp_t lab N, K = (values).shape + seen_na = np.zeros((accum).shape, dtype=np.uint8) with nogil: for i in range(N): lab = labels[i] if lab < 0: continue for j in range(K): - if not mask[i, j]: - val = values[i, j] - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val + if not skipna and seen_na[lab, j]: + mask[i, j] = 1 + else: + if not mask[i, j]: + val = values[i, j] + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val + else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval else: - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + seen_na[lab, j] = 1 @cython.boundscheck(False) @@ -1442,7 +1474,8 @@ def group_cummin(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None) -> None: + uint8_t[:, ::1] mask=None, + bint skipna=True) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1451,6 +1484,7 @@ def group_cummin(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, + skipna, compute_max=False ) @@ -1462,7 +1496,8 @@ def group_cummax(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None) -> None: + uint8_t[:, ::1] mask=None, + bint skipna=True) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1471,5 +1506,6 @@ def group_cummax(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, + skipna, compute_max=True ) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 939cff16bf1ae..e57e48cb3ab11 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2784,10 +2784,11 @@ def cummin(self, axis=0, **kwargs): ------- Series or DataFrame """ + skipna = kwargs.get("skipna", True) if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform("cummin", numeric_only=False) + return self._cython_transform("cummin", numeric_only=False, skipna=skipna) @final @Substitution(name="groupby") @@ -2800,10 +2801,11 @@ def cummax(self, axis=0, **kwargs): ------- Series or DataFrame """ + skipna = kwargs.get("skipna", True) if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform("cummax", numeric_only=False) + return self._cython_transform("cummax", numeric_only=False, skipna=skipna) @final def _get_cythonized_result( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5434fc49e2174..77e5e9ba133f5 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -803,6 +803,39 @@ def test_cummax(dtypes_for_minmax): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) +@pytest.mark.parametrize( + "groups,expected_data", + [ + ([1, 1, 1], [1, None, None]), + ([1, 2, 3], [1, None, 2]), + ([1, 3, 3], [1, None, None]), + ], +) +def test_cummin_max_skipna(method, dtype, groups, expected_data): + # GH-34047 + df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + gb = df.groupby(groups)["a"] + + result = getattr(gb, method)(skipna=False) + expected = Series(expected_data, dtype=dtype, name="a") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_max_skipna_multiple_cols(method): + # Ensure missing value in "a" doesn't cause "b" to be nan-filled + df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) + gb = df.groupby([1, 1, 1])[["a", "b"]] + + result = getattr(gb, method)(skipna=False) + expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_32bit @pytest.mark.parametrize("method", ["cummin", "cummax"]) @pytest.mark.parametrize(