Skip to content

BUG/ENH: group cummin/max handle skipna #41854

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 31, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ Other enhancements
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
- :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
Expand Down
135 changes: 98 additions & 37 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint skipna,
bint compute_max):
"""
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
Expand All @@ -1336,6 +1337,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
Number of groups, larger than all entries of `labels`.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool
If True, ignore nans in `values`.

compute_max : bool
True if cumulative maximum should be computed, False
if cumulative minimum should be computed
Expand All @@ -1345,61 +1349,114 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
This method modifies the `out` parameter, rather than returning an object.
"""
cdef:
Py_ssize_t i, j, N, K, size
groupby_t val, mval
groupby_t[:, ::1] accum
intp_t lab
bint val_is_nan, use_mask

use_mask = mask is not None

N, K = (<object>values).shape
accum = np.empty((ngroups, K), dtype=values.dtype)
accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
if groupby_t is int64_t:
accum[:] = -_int64_max if compute_max else _int64_max
elif groupby_t is uint64_t:
accum[:] = 0 if compute_max else np.iinfo(np.uint64).max
else:
accum[:] = -np.inf if compute_max else np.inf

if mask is not None:
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max)
else:
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max)


@cython.boundscheck(False)
@cython.wraparound(False)
cdef cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint is_datetimelike,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
`labels`.
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval, na_val
uint8_t[::1] seen_na
intp_t lab

if groupby_t is float64_t or groupby_t is float32_t:
na_val = NaN
elif is_datetimelike:
na_val = NPY_NAT

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we avoid allocating in some cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have changed to only allocate if a missing value is possible. Could avoid more aggressively perhaps (eg only as soon as a missing value is seen), see comment for the masked case

with nogil:
for i in range(N):
lab = labels[i]

if lab < 0:
continue
for j in range(K):
val_is_nan = False

if use_mask:
if mask[i, j]:

# `out` does not need to be set since it
# will be masked anyway
val_is_nan = True
else:

# If using the mask, we can avoid grabbing the
# value unless necessary
val = values[i, j]

# Otherwise, `out` must be set accordingly if the
# value is missing
if not skipna and seen_na[lab]:
out[i, j] = na_val
else:
val = values[i, j]
if _treat_as_na(val, is_datetimelike):
val_is_nan = True
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
seen_na[lab] = 1
out[i, j] = val

if not val_is_nan:
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val

@cython.boundscheck(False)
@cython.wraparound(False)
cdef masked_cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
`labels` with a masked algorithm.
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
uint8_t[::1] seen_na
intp_t lab

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape[0], dtype=np.uint8)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we avoid initializing this in some cases?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure here, since we can't tell whether a missing value is present without looking through the whole array. I suppose we could allocate only as soon as a missing value is seen, but that would add some complexity since the algo is wrapped in a nogil block.

Hopefully this is not a big memory addition, since it is ngroups * ncols bytes vs. the nrow * ncols * 4 for the values (since groupby_t only covers int64/float64/float32).

with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue
for j in range(K):
if not skipna and seen_na[lab]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticing duplication between this and the non-masked version above. In a recent PR to add masked support for min/max, I found that combining the two functions (and so doing a check for mask is None inside the loop) didn't make a noticeable perf difference.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed about the duplication not being great - split these in #41853 because when I originally tried to make this change with one combined algo, the logic ended up getting pretty convoluted, so I thought the tradeoff was worth it (but agreed about perf, saw no perf improvement from avoiding the mask is None check in #41853)

I'd hope that at some point, when more masked algos are supported, it'll make sense to keep only the masked kernels (and compute the mask outside, passing it in regardless of type). This would help avoid some of the ugliness around determining nulls with stuff like datetimelike.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, better to de-duplicate in a dedicated pass anyway

mask[i, j] = 1
else:
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
seen_na[lab] = 1


@cython.boundscheck(False)
Expand All @@ -1409,7 +1466,8 @@ def group_cummin(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1418,6 +1476,7 @@ def group_cummin(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=False
)

Expand All @@ -1429,7 +1488,8 @@ def group_cummax(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1438,5 +1498,6 @@ def group_cummax(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=True
)
6 changes: 4 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2737,10 +2737,11 @@ def cummin(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this go in the signature explicitly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some context in #41854 (comment).

Happy to make that change here; I was planning on doing it in a follow-up to make skipna and numeric_only consistent for cumsum and cummin/cummax.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree we want this in the signature; doing it as a follow-up is OK.

if axis != 0:
return self.apply(lambda x: np.minimum.accumulate(x, axis))

return self._cython_transform("cummin", numeric_only=False)
return self._cython_transform("cummin", numeric_only=False, skipna=skipna)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally cummin/max could handle kwargs the same way as cumsum does, but the problem is the forced numeric_only=False.

As a followup change, it would be nice to remove this inconsistency (especially since numeric_only=True would just be ignored). Would this require a deprecation cycle, or could current behavior of ignoring numeric_only=True be considered a bug?

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or could current behavior of ignoring numeric_only=True be considered a bug?

I think so, yes.

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Yeah, looks like we currently silently ignore args/kwargs, which isn't great.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, sounds good — I'll plan to leave this as a follow-on, since it should be an improvement independent of these changes.


@final
@Substitution(name="groupby")
Expand All @@ -2753,10 +2754,11 @@ def cummax(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
if axis != 0:
return self.apply(lambda x: np.maximum.accumulate(x, axis))

return self._cython_transform("cummax", numeric_only=False)
return self._cython_transform("cummax", numeric_only=False, skipna=skipna)

@final
def _get_cythonized_result(
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,24 @@ def test_cummax(dtypes_for_minmax):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
"groups,expected_data",
[
([1, 1, 1], [1, None, None]),
([1, 2, 3], [1, None, 2]),
([1, 3, 3], [1, None, None]),
],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
result = getattr(df.groupby(groups)["a"], method)(skipna=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: can you do this on multiple lines, e.g. gb = df.groupby(...)? it makes it more obvious to me what behavior is actually being tested

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, agreed that is clearer

expected = Series(expected_data, dtype=dtype, name="a")
tm.assert_series_equal(result, expected)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
Expand Down