Skip to content

BUG/ENH: group cummin/max handle skipna #41854

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 31, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ enhancement2

Other enhancements
^^^^^^^^^^^^^^^^^^
-
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
-

.. ---------------------------------------------------------------------------
Expand Down
82 changes: 57 additions & 25 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint skipna,
bint compute_max):
"""
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
Expand All @@ -1336,6 +1337,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
Number of groups, larger than all entries of `labels`.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool
If True, ignore nans in `values`.
compute_max : bool
True if cumulative maximum should be computed, False
if cumulative minimum should be computed
Expand All @@ -1356,9 +1359,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
accum[:] = -np.inf if compute_max else np.inf

if mask is not None:
masked_cummin_max(out, values, mask, labels, accum, compute_max)
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max)
else:
cummin_max(out, values, labels, accum, is_datetimelike, compute_max)
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max)


@cython.boundscheck(False)
Expand All @@ -1367,6 +1370,7 @@ cdef cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint is_datetimelike,
bint compute_max):
"""
Expand All @@ -1375,28 +1379,44 @@ cdef cummin_max(groupby_t[:, ::1] out,
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
groupby_t val, mval, na_val
uint8_t[::1] seen_na
intp_t lab

if groupby_t is float64_t or groupby_t is float32_t:
na_val = NaN
elif is_datetimelike:
na_val = NPY_NAT
# Will never be used, just to avoid uninitialized warning
else:
na_val = 0

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we avoid allocating in some cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have changed to only allocate if a missing value is possible. Could avoid more aggressively perhaps (eg only as soon as a missing value is seen), see comment for the masked case


N, K = (<object>values).shape
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue
for j in range(K):
val = values[i, j]
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
if not skipna and seen_na[lab]:
out[i, j] = na_val
else:
out[i, j] = val
val = values[i, j]
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
seen_na[lab] = 1
out[i, j] = val


@cython.boundscheck(False)
Expand All @@ -1406,6 +1426,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
Expand All @@ -1414,25 +1435,32 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
uint8_t[::1] seen_na
intp_t lab

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we avoid initializing this in some cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure here since we can't tell if a missing value is present without looking through the whole array. I suppose we could only allocate as soon as a missing value is seen, but that would add some complexity since algo is wrapped in a nogil.

Hopefully this is not a big memory addition since it is ngroups * ncols bytes vs the nrows * ncols * 4 for the values (since groupby_t only covers int64/float64/float32)

with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue
for j in range(K):
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
if not skipna and seen_na[lab]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticing duplication b/t this and the non-masked version above. In a recent PR to add masked support for min/max I found that combining the two functions (and so doing a check for mask is None inside the loop) didn't make a noticeable perf difference.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed about the duplication not being great - split these in #41853 because when I originally tried to make this change with one combined algo, the logic ended up getting pretty convoluted, so I thought the tradeoff was worth it (but agreed about perf, saw no perf improvement from avoiding the mask is None check in #41853)

I'd hope that at some point when more masked algos are supported, it'll make sense to keep only the masked kernels (and compute the mask outside, pass it in regardless of type). This would help avoid some of the ugliness around determining nulls with stuff like datetimelike.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, better to de-duplicate in a dedicated pass anyway

mask[i, j] = 1
else:
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
seen_na[lab] = 1


@cython.boundscheck(False)
Expand All @@ -1442,7 +1470,8 @@ def group_cummin(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1451,6 +1480,7 @@ def group_cummin(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=False
)

Expand All @@ -1462,7 +1492,8 @@ def group_cummax(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1471,5 +1502,6 @@ def group_cummax(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=True
)
6 changes: 4 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,10 +2737,11 @@ def cummin(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this go in the signature explicitly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some context in #41854 (comment).

Happy to make that change here, was planning on doing in followup to make skipna, numeric_only consistent for cumsum, cummin/max

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree we want this in the signature; doing it as a followup is ok.

if axis != 0:
return self.apply(lambda x: np.minimum.accumulate(x, axis))

return self._cython_transform("cummin", numeric_only=False)
return self._cython_transform("cummin", numeric_only=False, skipna=skipna)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally cummin/max could handle kwargs the same way as cumsum does, but the problem is the forced numeric_only=False.

As a followup change, it would be nice to remove this inconsistency (especially since numeric_only=True would just be ignored). Would this require a deprecation cycle, or could current behavior of ignoring numeric_only=True be considered a bug?

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or could current behavior of ignoring numeric_only=True be considered a bug?

I think so, yes.

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Yeah, looks like we currently silently ignore args/kwargs, which isn't great.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok sounds good, will plan to leave as followon since should be an independent improvement from these changes


@final
@Substitution(name="groupby")
Expand All @@ -2753,10 +2754,11 @@ def cummax(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
if axis != 0:
return self.apply(lambda x: np.maximum.accumulate(x, axis))

return self._cython_transform("cummax", numeric_only=False)
return self._cython_transform("cummax", numeric_only=False, skipna=skipna)

@final
def _get_cythonized_result(
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,24 @@ def test_cummax(dtypes_for_minmax):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
"groups,expected_data",
[
([1, 1, 1], [1, None, None]),
([1, 2, 3], [1, None, 2]),
([1, 3, 3], [1, None, None]),
],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
result = getattr(df.groupby(groups)["a"], method)(skipna=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: can you do this on multiple lines, e.g. gb = df.groupby(...)? it makes it more obvious to me what behavior is actually being tested

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, agreed that is clearer

expected = Series(expected_data, dtype=dtype, name="a")
tm.assert_series_equal(result, expected)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
Expand Down