From 97d7e11082270d5813fc5c700746bd753c410b09 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 6 Jun 2021 16:56:17 -0400 Subject: [PATCH 01/11] precommit fixup --- pandas/_libs/groupby.pyx | 89 ++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b72b927b3c2a8..40933c8f30319 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1345,13 +1345,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. """ cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval + Py_ssize_t N, K groupby_t[:, ::1] accum - intp_t lab - bint val_is_nan, use_mask - - use_mask = mask is not None N, K = (values).shape accum = np.empty((ngroups, K), dtype=values.dtype) @@ -1362,36 +1357,78 @@ cdef group_cummin_max(groupby_t[:, ::1] out, else: accum[:] = -np.inf if compute_max else np.inf + if mask is not None: + masked_cummin_max(out, values, mask, labels, accum, N, K, compute_max) + else: + cummin_max(out, values, labels, accum, N, K, is_datetimelike, compute_max) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef cummin_max(groupby_t[:, ::1] out, + ndarray[groupby_t, ndim=2] values, + const intp_t[:] labels, + groupby_t[:, ::1] accum, + Py_ssize_t N, + Py_ssize_t K, + bint is_datetimelike, + bint compute_max): + """ + Compute the cumulative minimum/maximum of columns of `values`, in row groups + `labels`. + """ + cdef: + Py_ssize_t i, j + groupby_t val, mval + intp_t lab + with nogil: for i in range(N): lab = labels[i] - if lab < 0: continue for j in range(K): - val_is_nan = False - - if use_mask: - if mask[i, j]: - - # `out` does not need to be set since it - # will be masked anyway - val_is_nan = True + val = values[i, j] + if not _treat_as_na(val, is_datetimelike): + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + else: + out[i, j] = val - # If using the mask, we can avoid grabbing the - # value unless necessary - val = values[i, j] - # Otherwise, `out` must be set accordingly if the - # value is missing - else: - val = values[i, j] - if _treat_as_na(val, is_datetimelike): - val_is_nan = True - out[i, j] = val +@cython.boundscheck(False) +@cython.wraparound(False) +cdef masked_cummin_max(groupby_t[:, ::1] out, + ndarray[groupby_t, ndim=2] values, + uint8_t[:, ::1] mask, + const intp_t[:] labels, + groupby_t[:, ::1] accum, + Py_ssize_t N, + Py_ssize_t K, + bint compute_max): + """ + Compute the cumulative minimum/maximum of columns of `values`, in row groups + `labels` with a masked algorithm. + """ + cdef: + Py_ssize_t i, j + groupby_t val, mval + intp_t lab - if not val_is_nan: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + for j in range(K): + if not mask[i, j]: + val = values[i, j] mval = accum[lab, j] if compute_max: if val > mval: From ee7a7ef627c475faa3a6e3b45a91ab204a8cf84a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 6 Jun 2021 18:51:32 -0400 Subject: [PATCH 02/11] precommit fixup --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/groupby.pyx | 88 +++++++++++++++++---------- pandas/core/groupby/groupby.py | 20 ++++-- pandas/tests/groupby/test_function.py | 10 +++ 4 files changed, 84 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ce613fd78c1e1..44c9a39d30932 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -230,6 +230,7 @@ Other enhancements - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) - :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`) +- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 40933c8f30319..75ac94975e8b7 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, + bint skipna, bint compute_max): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. @@ -1336,6 +1337,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Number of groups, larger than all entries of `labels`. is_datetimelike : bool True if `values` contains datetime-like entries. + skipna : bool + If True, ignore nans in `values`. + compute_max : bool True if cumulative maximum should be computed, False if cumulative minimum should be computed @@ -1358,9 +1362,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, accum[:] = -np.inf if compute_max else np.inf if mask is not None: - masked_cummin_max(out, values, mask, labels, accum, N, K, compute_max) + masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max) else: - cummin_max(out, values, labels, accum, N, K, is_datetimelike, compute_max) + cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max) @cython.boundscheck(False) @@ -1369,8 +1373,7 @@ cdef cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, groupby_t[:, ::1] accum, - Py_ssize_t N, - Py_ssize_t K, + bint skipna, bint is_datetimelike, bint compute_max): """ @@ -1378,28 +1381,40 @@ cdef cummin_max(groupby_t[:, ::1] out, `labels`. """ cdef: - Py_ssize_t i, j - groupby_t val, mval + Py_ssize_t i, j, N, K + groupby_t val, mval, na_val + uint8_t[::1] seen_na intp_t lab + if groupby_t is float64_t or groupby_t is float32_t: + na_val = NaN + elif is_datetimelike: + na_val = NPY_NAT + + seen_na = np.zeros((values).shape[0], dtype=np.uint8) + N, K = (values).shape with nogil: for i in range(N): lab = labels[i] if lab < 0: continue for j in range(K): - val = values[i, j] - if not _treat_as_na(val, is_datetimelike): - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val - else: - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + if not skipna and seen_na[lab]: + out[i, j] = na_val else: - out[i, j] = val + val = values[i, j] + if not _treat_as_na(val, is_datetimelike): + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val + else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + else: + seen_na[lab] = 1 + out[i, j] = val @cython.boundscheck(False) @@ -1409,34 +1424,41 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const intp_t[:] labels, groupby_t[:, ::1] accum, - Py_ssize_t N, - Py_ssize_t K, + bint skipna, bint compute_max): """ Compute the cumulative minimum/maximum of columns of `values`, in row groups `labels` with a masked algorithm. """ cdef: - Py_ssize_t i, j + Py_ssize_t i, j, N, K groupby_t val, mval + uint8_t[::1] seen_na intp_t lab + seen_na = np.zeros((values).shape[0], dtype=np.uint8) + N, K = (values).shape with nogil: for i in range(N): lab = labels[i] if lab < 0: continue for j in range(K): - if not mask[i, j]: - val = values[i, j] - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val + if not skipna and seen_na[lab]: + mask[i, j] = 1 + else: + if not mask[i, j]: + val = values[i, j] + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val + else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval else: - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + seen_na[lab] = 1 @cython.boundscheck(False) @@ -1446,7 +1468,8 @@ def group_cummin(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None) -> None: + uint8_t[:, ::1] mask=None, + bint skipna=True) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1455,6 +1478,7 @@ def group_cummin(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, + skipna, compute_max=False ) @@ -1466,7 +1490,8 @@ def group_cummax(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None) -> None: + uint8_t[:, ::1] mask=None, + bint skipna=True) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1475,5 +1500,6 @@ def group_cummax(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, + skipna, compute_max=True ) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6deb5bb1a76f0..61b16392c43b5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2729,7 +2729,7 @@ def cumsum(self, axis=0, *args, **kwargs): @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummin(self, axis=0, **kwargs): + def cummin(self, axis=0, *args, **kwargs): """ Cumulative min for each group. @@ -2737,15 +2737,21 @@ def cummin(self, axis=0, **kwargs): ------- Series or DataFrame """ + # See groupby/test_function::test_cummin_cummax, current behavior allows + # but ignores numeric_only + if "numeric_only" in kwargs: + kwargs = kwargs.copy() + del kwargs["numeric_only"] + nv.validate_groupby_func("cummin", args, kwargs, ["skipna"]) if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform("cummin", numeric_only=False) + return self._cython_transform("cummin", numeric_only=False, **kwargs) @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummax(self, axis=0, **kwargs): + def cummax(self, axis=0, *args, **kwargs): """ Cumulative max for each group. @@ -2753,10 +2759,16 @@ def cummax(self, axis=0, **kwargs): ------- Series or DataFrame """ + # See groupby/test_function::test_cummin_cummax, current behavior allows + # but ignores numeric_only + if "numeric_only" in kwargs: + kwargs = kwargs.copy() + del kwargs["numeric_only"] + nv.validate_groupby_func("cummax", args, kwargs, ["skipna"]) if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform("cummax", numeric_only=False) + return self._cython_transform("cummax", numeric_only=False, **kwargs) @final def _get_cythonized_result( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 95bb010015f62..b79eb15c1e835 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -904,6 +904,16 @@ def test_cummax(dtypes_for_minmax): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) +def test_cummin_max_skipna(method, dtype): + # GH-34047 + df = DataFrame({"a": Series([1, None, 1], dtype=dtype)}) + result = getattr(df.groupby([1, 1, 1])["a"], method)(skipna=False) + expected = Series([1, None, None], dtype=dtype, name="a") + tm.assert_series_equal(result, expected) + + @td.skip_if_32bit @pytest.mark.parametrize("method", ["cummin", "cummax"]) @pytest.mark.parametrize( From 885f8ae956e918c6cfe360bd2cb66aebbd5879cc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 6 Jun 2021 20:18:47 -0400 Subject: [PATCH 03/11] wip --- pandas/_libs/groupby.pyx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 40933c8f30319..ee57ff5c2a205 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1358,9 +1358,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, accum[:] = -np.inf if compute_max else np.inf if mask is not None: - masked_cummin_max(out, values, mask, labels, accum, N, K, compute_max) + masked_cummin_max(out, values, mask, labels, accum, compute_max) else: - cummin_max(out, values, labels, accum, N, K, is_datetimelike, compute_max) + cummin_max(out, values, labels, accum, is_datetimelike, compute_max) @cython.boundscheck(False) @@ -1369,8 +1369,6 @@ cdef cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, groupby_t[:, ::1] accum, - Py_ssize_t N, - Py_ssize_t K, bint is_datetimelike, bint compute_max): """ @@ -1378,10 +1376,11 @@ cdef cummin_max(groupby_t[:, ::1] out, `labels`. """ cdef: - Py_ssize_t i, j + Py_ssize_t i, j, N, K groupby_t val, mval intp_t lab + N, K = (values).shape with nogil: for i in range(N): lab = labels[i] @@ -1409,18 +1408,17 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const intp_t[:] labels, groupby_t[:, ::1] accum, - Py_ssize_t N, - Py_ssize_t K, bint compute_max): """ Compute the cumulative minimum/maximum of columns of `values`, in row groups `labels` with a masked algorithm. """ cdef: - Py_ssize_t i, j + Py_ssize_t i, j, N, K groupby_t val, mval intp_t lab + N, K = (values).shape with nogil: for i in range(N): lab = labels[i] From 51f8f9c8ccd0def2c0a4f93dc398a89ab94e986e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 11:00:19 -0400 Subject: [PATCH 04/11] Remove unused --- pandas/_libs/groupby.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ee57ff5c2a205..0e0598c3264e8 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1345,11 +1345,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. """ cdef: - Py_ssize_t N, K groupby_t[:, ::1] accum - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=values.dtype) + accum = np.empty((ngroups, (values).shape[1]), dtype=values.dtype) if groupby_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max elif groupby_t is uint64_t: From 21e515758eeb591643ea704aec861077e7ade469 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 11:24:23 -0400 Subject: [PATCH 05/11] Add multigroup tests --- pandas/tests/groupby/test_function.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b79eb15c1e835..478d903ec791c 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -906,11 +906,19 @@ def test_cummax(dtypes_for_minmax): @pytest.mark.parametrize("method", ["cummin", "cummax"]) @pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) -def test_cummin_max_skipna(method, dtype): +@pytest.mark.parametrize( + "groups,expected_data", + [ + ([1, 1, 1], [1, None, None]), + ([1, 2, 3], [1, None, 2]), + ([1, 3, 3], [1, None, None]), + ], +) +def test_cummin_max_skipna(method, dtype, groups, expected_data): # GH-34047 - df = DataFrame({"a": Series([1, None, 1], dtype=dtype)}) - result = getattr(df.groupby([1, 1, 1])["a"], method)(skipna=False) - expected = Series([1, None, None], dtype=dtype, name="a") + df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + result = getattr(df.groupby(groups)["a"], method)(skipna=False) + expected = Series(expected_data, dtype=dtype, name="a") tm.assert_series_equal(result, expected) From 7858fc657986f80ca84538261664c04e69adc55e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 11:34:40 -0400 Subject: [PATCH 06/11] Cleaner kwargs handling --- pandas/core/groupby/groupby.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 61b16392c43b5..eca4915c49732 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2729,7 +2729,7 @@ def cumsum(self, axis=0, *args, **kwargs): @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummin(self, axis=0, *args, **kwargs): + def cummin(self, axis=0, **kwargs): """ Cumulative min for each group. @@ -2737,21 +2737,16 @@ def cummin(self, axis=0, *args, **kwargs): ------- Series or DataFrame """ - # See groupby/test_function::test_cummin_cummax, current behavior allows - # but ignores numeric_only - if "numeric_only" in kwargs: - kwargs = kwargs.copy() - del kwargs["numeric_only"] - nv.validate_groupby_func("cummin", args, kwargs, ["skipna"]) + skipna = kwargs.get("skipna", True) if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform("cummin", numeric_only=False, **kwargs) + return self._cython_transform("cummin", numeric_only=False, skipna=skipna) @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummax(self, axis=0, *args, **kwargs): + def cummax(self, axis=0, **kwargs): """ Cumulative max for each group. @@ -2759,16 +2754,11 @@ def cummax(self, axis=0, *args, **kwargs): ------- Series or DataFrame """ - # See groupby/test_function::test_cummin_cummax, current behavior allows - # but ignores numeric_only - if "numeric_only" in kwargs: - kwargs = kwargs.copy() - del kwargs["numeric_only"] - nv.validate_groupby_func("cummax", args, kwargs, ["skipna"]) + skipna = kwargs.get("skipna", True) if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform("cummax", numeric_only=False, **kwargs) + return self._cython_transform("cummax", numeric_only=False, skipna=skipna) @final def _get_cythonized_result( From 3c869ce48863d43ee1c6bccfca5ee46fae791b7b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 7 Jun 2021 11:51:44 -0400 Subject: [PATCH 07/11] Fix build warning --- pandas/_libs/groupby.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 60e4c1dbc6a40..a25069f4bb0fd 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1388,6 +1388,9 @@ cdef cummin_max(groupby_t[:, ::1] out, na_val = NaN elif is_datetimelike: na_val = NPY_NAT + # Will never be used, just to avoid uninitialized warning + else: + na_val = 0 N, K = (values).shape seen_na = np.zeros((accum).shape[0], dtype=np.uint8) From c448837197f43b58a36b840e54775f7409da4c4a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 12 Jun 2021 13:45:18 -0400 Subject: [PATCH 08/11] Fixup merge conflict and move whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 - pandas/_libs/groupby.pyx | 11 +++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 3e63054e6a2cf..dd95f9088e3da 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -268,7 +268,6 @@ Other enhancements - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) - :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`) -- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 32dbac106c4ba..c8c02ba3c605c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1339,7 +1339,6 @@ cdef group_cummin_max(groupby_t[:, ::1] out, True if `values` contains datetime-like entries. skipna : bool If True, ignore nans in `values`. - compute_max : bool True if cumulative maximum should be computed, False if cumulative minimum should be computed @@ -1359,14 +1358,14 @@ cdef group_cummin_max(groupby_t[:, ::1] out, else: accum[:] = -np.inf if compute_max else np.inf - if mask is not None: + if mask is None: cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max) else: masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max) -@cython.boundscheck(False) -@cython.wraparound(False) +# @cython.boundscheck(False) +# @cython.wraparound(False) cdef cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, @@ -1420,8 +1419,8 @@ cdef cummin_max(groupby_t[:, ::1] out, out[i, j] = val -@cython.boundscheck(False) -@cython.wraparound(False) +# @cython.boundscheck(False) +# @cython.wraparound(False) cdef masked_cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, uint8_t[:, ::1] mask, From 9333d5831de27cafe84615c19bba0d4fc3ade9a2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 12 Jun 2021 13:50:30 -0400 Subject: [PATCH 09/11] Shrink diff --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/_libs/groupby.pyx | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 166ea2f0d4164..942f69e1ea4bb 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -29,7 +29,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c8c02ba3c605c..f916a5c8b755a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1358,14 +1358,14 @@ cdef group_cummin_max(groupby_t[:, ::1] out, else: accum[:] = -np.inf if compute_max else np.inf - if mask is None: - cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max) - else: + if mask is not None: masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max) + else: + cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max) -# @cython.boundscheck(False) -# @cython.wraparound(False) +@cython.boundscheck(False) +@cython.wraparound(False) cdef cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, @@ -1419,8 +1419,8 @@ cdef cummin_max(groupby_t[:, ::1] out, out[i, j] = val -# @cython.boundscheck(False) -# @cython.wraparound(False) +@cython.boundscheck(False) +@cython.wraparound(False) cdef masked_cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, uint8_t[:, ::1] mask, From dfe8b73e842f87ec3fa3fe561a5e6c3d2f3dec49 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 25 Jun 2021 21:29:04 -0400 Subject: [PATCH 10/11] Clean up test --- pandas/tests/groupby/test_function.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index fb658b0a2cadb..161e4e9cd67f4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -816,8 +816,11 @@ def test_cummax(dtypes_for_minmax): def test_cummin_max_skipna(method, dtype, groups, expected_data): # GH-34047 df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) - result = getattr(df.groupby(groups)["a"], method)(skipna=False) + gb = df.groupby(groups)["a"] + + result = getattr(gb, method)(skipna=False) expected = Series(expected_data, dtype=dtype, name="a") + tm.assert_series_equal(result, expected) From 5ccdf94c90b9fe3dced13cb797d3c602cc1ce71a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 24 Jul 2021 20:38:28 -0400 Subject: [PATCH 11/11] Avoid seen na alloc, fix multicol case --- pandas/_libs/groupby.pyx | 22 +++++++++++++--------- pandas/tests/groupby/test_function.py | 12 ++++++++++++ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f916a5c8b755a..91921ba0e64c2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1380,19 +1380,23 @@ cdef cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K groupby_t val, mval, na_val - uint8_t[::1] seen_na + uint8_t[:, ::1] seen_na intp_t lab + bint na_possible if groupby_t is float64_t or groupby_t is float32_t: na_val = NaN + na_possible = True elif is_datetimelike: na_val = NPY_NAT + na_possible = True # Will never be used, just to avoid uninitialized warning else: na_val = 0 + na_possible = False - N, K = (values).shape - seen_na = np.zeros((accum).shape[0], dtype=np.uint8) + if na_possible: + seen_na = np.zeros((accum).shape, dtype=np.uint8) N, K = (values).shape with nogil: @@ -1401,7 +1405,7 @@ cdef cummin_max(groupby_t[:, ::1] out, if lab < 0: continue for j in range(K): - if not skipna and seen_na[lab]: + if not skipna and na_possible and seen_na[lab, j]: out[i, j] = na_val else: val = values[i, j] @@ -1415,7 +1419,7 @@ cdef cummin_max(groupby_t[:, ::1] out, accum[lab, j] = mval = val out[i, j] = mval else: - seen_na[lab] = 1 + seen_na[lab, j] = 1 out[i, j] = val @@ -1435,18 +1439,18 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K groupby_t val, mval - uint8_t[::1] seen_na + uint8_t[:, ::1] seen_na intp_t lab N, K = (values).shape - seen_na = np.zeros((accum).shape[0], dtype=np.uint8) + seen_na = np.zeros((accum).shape, dtype=np.uint8) with nogil: for i in range(N): lab = labels[i] if lab < 0: continue for j in range(K): - if not skipna and seen_na[lab]: + if not skipna and seen_na[lab, j]: mask[i, j] = 1 else: if not mask[i, j]: @@ -1460,7 +1464,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, accum[lab, j] = mval = val out[i, j] = mval else: - seen_na[lab] = 1 + seen_na[lab, j] = 1 @cython.boundscheck(False) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 161e4e9cd67f4..77e5e9ba133f5 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -824,6 +824,18 @@ def test_cummin_max_skipna(method, dtype, groups, expected_data): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_max_skipna_multiple_cols(method): + # Ensure missing value in "a" doesn't cause "b" to be nan-filled + df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) + gb = df.groupby([1, 1, 1])[["a", "b"]] + + result = getattr(gb, method)(skipna=False) + expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_32bit @pytest.mark.parametrize("method", ["cummin", "cummax"]) @pytest.mark.parametrize(