-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG/ENH: group cummin/max handle skipna #41854
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
97d7e11
ee7a7ef
885f8ae
51f8f9c
d7cf5b5
21e5157
7858fc6
3c869ce
e8a08bf
c448837
9333d58
1aa3586
856a1b2
dfe8b73
22bef11
0d57fc0
5ccdf94
fb76640
a3db4de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out, | |
const intp_t[:] labels, | ||
int ngroups, | ||
bint is_datetimelike, | ||
bint skipna, | ||
bint compute_max): | ||
""" | ||
Cumulative minimum/maximum of columns of `values`, in row groups `labels`. | ||
|
@@ -1336,6 +1337,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, | |
Number of groups, larger than all entries of `labels`. | ||
is_datetimelike : bool | ||
True if `values` contains datetime-like entries. | ||
skipna : bool | ||
If True, ignore nans in `values`. | ||
|
||
compute_max : bool | ||
True if cumulative maximum should be computed, False | ||
if cumulative minimum should be computed | ||
|
@@ -1345,61 +1349,114 @@ cdef group_cummin_max(groupby_t[:, ::1] out, | |
This method modifies the `out` parameter, rather than returning an object. | ||
""" | ||
cdef: | ||
Py_ssize_t i, j, N, K, size | ||
groupby_t val, mval | ||
groupby_t[:, ::1] accum | ||
intp_t lab | ||
bint val_is_nan, use_mask | ||
|
||
use_mask = mask is not None | ||
|
||
N, K = (<object>values).shape | ||
accum = np.empty((ngroups, K), dtype=values.dtype) | ||
accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype) | ||
if groupby_t is int64_t: | ||
accum[:] = -_int64_max if compute_max else _int64_max | ||
elif groupby_t is uint64_t: | ||
accum[:] = 0 if compute_max else np.iinfo(np.uint64).max | ||
else: | ||
accum[:] = -np.inf if compute_max else np.inf | ||
|
||
if mask is not None: | ||
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max) | ||
else: | ||
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max) | ||
|
||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
cdef cummin_max(groupby_t[:, ::1] out, | ||
ndarray[groupby_t, ndim=2] values, | ||
const intp_t[:] labels, | ||
groupby_t[:, ::1] accum, | ||
bint skipna, | ||
bint is_datetimelike, | ||
bint compute_max): | ||
""" | ||
Compute the cumulative minimum/maximum of columns of `values`, in row groups | ||
`labels`. | ||
""" | ||
cdef: | ||
Py_ssize_t i, j, N, K | ||
groupby_t val, mval, na_val | ||
uint8_t[::1] seen_na | ||
intp_t lab | ||
|
||
if groupby_t is float64_t or groupby_t is float32_t: | ||
na_val = NaN | ||
elif is_datetimelike: | ||
na_val = NPY_NAT | ||
|
||
N, K = (<object>values).shape | ||
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8) | ||
with nogil: | ||
for i in range(N): | ||
lab = labels[i] | ||
|
||
if lab < 0: | ||
continue | ||
for j in range(K): | ||
val_is_nan = False | ||
|
||
if use_mask: | ||
if mask[i, j]: | ||
|
||
# `out` does not need to be set since it | ||
# will be masked anyway | ||
val_is_nan = True | ||
else: | ||
|
||
# If using the mask, we can avoid grabbing the | ||
# value unless necessary | ||
val = values[i, j] | ||
|
||
# Otherwise, `out` must be set accordingly if the | ||
# value is missing | ||
if not skipna and seen_na[lab]: | ||
out[i, j] = na_val | ||
else: | ||
val = values[i, j] | ||
if _treat_as_na(val, is_datetimelike): | ||
val_is_nan = True | ||
if not _treat_as_na(val, is_datetimelike): | ||
mval = accum[lab, j] | ||
if compute_max: | ||
if val > mval: | ||
accum[lab, j] = mval = val | ||
else: | ||
if val < mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
else: | ||
seen_na[lab] = 1 | ||
out[i, j] = val | ||
|
||
if not val_is_nan: | ||
mval = accum[lab, j] | ||
if compute_max: | ||
if val > mval: | ||
accum[lab, j] = mval = val | ||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
cdef masked_cummin_max(groupby_t[:, ::1] out, | ||
ndarray[groupby_t, ndim=2] values, | ||
uint8_t[:, ::1] mask, | ||
const intp_t[:] labels, | ||
groupby_t[:, ::1] accum, | ||
bint skipna, | ||
bint compute_max): | ||
""" | ||
Compute the cumulative minimum/maximum of columns of `values`, in row groups | ||
`labels` with a masked algorithm. | ||
""" | ||
cdef: | ||
Py_ssize_t i, j, N, K | ||
groupby_t val, mval | ||
uint8_t[::1] seen_na | ||
intp_t lab | ||
|
||
N, K = (<object>values).shape | ||
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we avoid initializing this in some cases? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure here since we can't tell if a missing value is present without looking through the whole array. I suppose we could only allocate as soon as a missing value is seen, but that would add some complexity since algo is wrapped in a Hopefully this is not a big memory addition since it is |
||
with nogil: | ||
for i in range(N): | ||
lab = labels[i] | ||
if lab < 0: | ||
continue | ||
for j in range(K): | ||
if not skipna and seen_na[lab]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Noticing duplication b/t this and the non-masked version above. in a recent PR to add masked support for min/max i found that combining the two functions (and so doing a check for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed about the duplication not being great - split these in #41853 because when I originally tried to make this change with one combined algo, the logic ended up getting pretty convoluted, so I thought the tradeoff was worth it (but agreed about perf, saw no perf improvement from avoiding the I'd hope that at some point when more masked algos are supported, it'll make sense to keep only the masked kernels (and compute the mask outside, pass it in regardless of type). This would help avoid some of ugliness around determining nulls with stuff like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. makes sense, better to de-duplicate in a dedicated pass anyway |
||
mask[i, j] = 1 | ||
else: | ||
if not mask[i, j]: | ||
val = values[i, j] | ||
mval = accum[lab, j] | ||
if compute_max: | ||
if val > mval: | ||
accum[lab, j] = mval = val | ||
else: | ||
if val < mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
else: | ||
if val < mval: | ||
accum[lab, j] = mval = val | ||
out[i, j] = mval | ||
seen_na[lab] = 1 | ||
|
||
|
||
@cython.boundscheck(False) | ||
|
@@ -1409,7 +1466,8 @@ def group_cummin(groupby_t[:, ::1] out, | |
const intp_t[:] labels, | ||
int ngroups, | ||
bint is_datetimelike, | ||
uint8_t[:, ::1] mask=None) -> None: | ||
uint8_t[:, ::1] mask=None, | ||
bint skipna=True) -> None: | ||
"""See group_cummin_max.__doc__""" | ||
group_cummin_max( | ||
out, | ||
|
@@ -1418,6 +1476,7 @@ def group_cummin(groupby_t[:, ::1] out, | |
labels, | ||
ngroups, | ||
is_datetimelike, | ||
skipna, | ||
compute_max=False | ||
) | ||
|
||
|
@@ -1429,7 +1488,8 @@ def group_cummax(groupby_t[:, ::1] out, | |
const intp_t[:] labels, | ||
int ngroups, | ||
bint is_datetimelike, | ||
uint8_t[:, ::1] mask=None) -> None: | ||
uint8_t[:, ::1] mask=None, | ||
bint skipna=True) -> None: | ||
"""See group_cummin_max.__doc__""" | ||
group_cummin_max( | ||
out, | ||
|
@@ -1438,5 +1498,6 @@ def group_cummax(groupby_t[:, ::1] out, | |
labels, | ||
ngroups, | ||
is_datetimelike, | ||
skipna, | ||
compute_max=True | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2737,10 +2737,11 @@ def cummin(self, axis=0, **kwargs): | |
------- | ||
Series or DataFrame | ||
""" | ||
skipna = kwargs.get("skipna", True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this go in the signature explicitly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some context in #41854 (comment). Happy to make that change here, was planning on doing in followup to make There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah i agree we want this in the signature, can be a followup is ok. |
||
if axis != 0: | ||
return self.apply(lambda x: np.minimum.accumulate(x, axis)) | ||
|
||
return self._cython_transform("cummin", numeric_only=False) | ||
return self._cython_transform("cummin", numeric_only=False, skipna=skipna) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally As a followup change, it would be nice to remove this inconsistency (especially since Making this change could enable actually exposing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think so, yes.
Yah, looks like we currently silently ignore args/kwargs, which isnt great. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok sounds good, will plan to leave as followon since should be an independent improvement from these changes |
||
|
||
@final | ||
@Substitution(name="groupby") | ||
|
@@ -2753,10 +2754,11 @@ def cummax(self, axis=0, **kwargs): | |
------- | ||
Series or DataFrame | ||
""" | ||
skipna = kwargs.get("skipna", True) | ||
if axis != 0: | ||
return self.apply(lambda x: np.maximum.accumulate(x, axis)) | ||
|
||
return self._cython_transform("cummax", numeric_only=False) | ||
return self._cython_transform("cummax", numeric_only=False, skipna=skipna) | ||
|
||
@final | ||
def _get_cythonized_result( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -904,6 +904,24 @@ def test_cummax(dtypes_for_minmax): | |
tm.assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize("method", ["cummin", "cummax"]) | ||
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) | ||
@pytest.mark.parametrize( | ||
"groups,expected_data", | ||
[ | ||
([1, 1, 1], [1, None, None]), | ||
([1, 2, 3], [1, None, 2]), | ||
([1, 3, 3], [1, None, None]), | ||
], | ||
) | ||
def test_cummin_max_skipna(method, dtype, groups, expected_data): | ||
# GH-34047 | ||
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) | ||
result = getattr(df.groupby(groups)["a"], method)(skipna=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick: can you do this on multiple lines, e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, agreed that is clearer |
||
expected = Series(expected_data, dtype=dtype, name="a") | ||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
@td.skip_if_32bit | ||
@pytest.mark.parametrize("method", ["cummin", "cummax"]) | ||
@pytest.mark.parametrize( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we avoid allocating in some cases?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have changed to only allocate if a missing value is possible. Could avoid more aggressively perhaps (eg only as soon as a missing value is seen), see comment for the masked case