Skip to content

REF: split out grouped masked cummin/max algo #41853

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 9, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 61 additions & 28 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1345,53 +1345,86 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
This method modifies the `out` parameter, rather than returning an object.
"""
cdef:
Py_ssize_t i, j, N, K, size
groupby_t val, mval
groupby_t[:, ::1] accum
intp_t lab
bint val_is_nan, use_mask

use_mask = mask is not None

N, K = (<object>values).shape
accum = np.empty((ngroups, K), dtype=values.dtype)
accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
if groupby_t is int64_t:
accum[:] = -_int64_max if compute_max else _int64_max
elif groupby_t is uint64_t:
accum[:] = 0 if compute_max else np.iinfo(np.uint64).max
else:
accum[:] = -np.inf if compute_max else np.inf

if mask is not None:
masked_cummin_max(out, values, mask, labels, accum, compute_max)
else:
cummin_max(out, values, labels, accum, is_datetimelike, compute_max)


@cython.boundscheck(False)
@cython.wraparound(False)
cdef cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint is_datetimelike,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
`labels`.
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
intp_t lab

N, K = (<object>values).shape
with nogil:
for i in range(N):
lab = labels[i]

if lab < 0:
continue
for j in range(K):
val_is_nan = False

if use_mask:
if mask[i, j]:

# `out` does not need to be set since it
# will be masked anyway
val_is_nan = True
val = values[i, j]
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
out[i, j] = val

# If using the mask, we can avoid grabbing the
# value unless necessary
val = values[i, j]

# Otherwise, `out` must be set accordingly if the
# value is missing
else:
val = values[i, j]
if _treat_as_na(val, is_datetimelike):
val_is_nan = True
out[i, j] = val
@cython.boundscheck(False)
@cython.wraparound(False)
cdef masked_cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
`labels` with a masked algorithm.
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
intp_t lab

if not val_is_nan:
N, K = (<object>values).shape
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue
for j in range(K):
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
Expand Down