diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index b72b927b3c2a8..0e0598c3264e8 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1345,16 +1345,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
     This method modifies the `out` parameter, rather than returning an object.
     """
     cdef:
-        Py_ssize_t i, j, N, K, size
-        groupby_t val, mval
         groupby_t[:, ::1] accum
-        intp_t lab
-        bint val_is_nan, use_mask
 
-    use_mask = mask is not None
-
-    N, K = (<object>values).shape
-    accum = np.empty((ngroups, K), dtype=values.dtype)
+    accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
     if groupby_t is int64_t:
         accum[:] = -_int64_max if compute_max else _int64_max
     elif groupby_t is uint64_t:
@@ -1362,36 +1355,76 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
     else:
         accum[:] = -np.inf if compute_max else np.inf
 
+    if mask is not None:
+        masked_cummin_max(out, values, mask, labels, accum, compute_max)
+    else:
+        cummin_max(out, values, labels, accum, is_datetimelike, compute_max)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef cummin_max(groupby_t[:, ::1] out,
+                ndarray[groupby_t, ndim=2] values,
+                const intp_t[:] labels,
+                groupby_t[:, ::1] accum,
+                bint is_datetimelike,
+                bint compute_max):
+    """
+    Compute the cumulative minimum/maximum of columns of `values`, in row groups
+    `labels`.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K
+        groupby_t val, mval
+        intp_t lab
+
+    N, K = (<object>values).shape
     with nogil:
         for i in range(N):
             lab = labels[i]
-
             if lab < 0:
                 continue
             for j in range(K):
-                val_is_nan = False
-
-                if use_mask:
-                    if mask[i, j]:
-
-                        # `out` does not need to be set since it
-                        # will be masked anyway
-                        val_is_nan = True
+                val = values[i, j]
+                if not _treat_as_na(val, is_datetimelike):
+                    mval = accum[lab, j]
+                    if compute_max:
+                        if val > mval:
+                            accum[lab, j] = mval = val
                     else:
+                        if val < mval:
+                            accum[lab, j] = mval = val
+                    out[i, j] = mval
+                else:
+                    out[i, j] = val
 
-                        # If using the mask, we can avoid grabbing the
-                        # value unless necessary
-                        val = values[i, j]
 
-                # Otherwise, `out` must be set accordingly if the
-                # value is missing
-                else:
-                    val = values[i, j]
-                    if _treat_as_na(val, is_datetimelike):
-                        val_is_nan = True
-                        out[i, j] = val
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef masked_cummin_max(groupby_t[:, ::1] out,
+                       ndarray[groupby_t, ndim=2] values,
+                       uint8_t[:, ::1] mask,
+                       const intp_t[:] labels,
+                       groupby_t[:, ::1] accum,
+                       bint compute_max):
+    """
+    Compute the cumulative minimum/maximum of columns of `values`, in row groups
+    `labels` with a masked algorithm.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K
+        groupby_t val, mval
+        intp_t lab
 
-                if not val_is_nan:
+    N, K = (<object>values).shape
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            for j in range(K):
+                if not mask[i, j]:
+                    val = values[i, j]
                     mval = accum[lab, j]
                     if compute_max:
                         if val > mval: