Skip to content

BUG/ENH: group cummin/max handle skipna #41854

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jul 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Other enhancements
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
-

.. ---------------------------------------------------------------------------
Expand Down
86 changes: 61 additions & 25 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint skipna,
bint compute_max):
"""
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
Expand All @@ -1336,6 +1337,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
Number of groups, larger than all entries of `labels`.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool
If True, ignore nans in `values`.
compute_max : bool
True if cumulative maximum should be computed, False
if cumulative minimum should be computed
Expand All @@ -1356,9 +1359,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
accum[:] = -np.inf if compute_max else np.inf

if mask is not None:
masked_cummin_max(out, values, mask, labels, accum, compute_max)
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max)
else:
cummin_max(out, values, labels, accum, is_datetimelike, compute_max)
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max)


@cython.boundscheck(False)
Expand All @@ -1367,6 +1370,7 @@ cdef cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint is_datetimelike,
bint compute_max):
"""
Expand All @@ -1375,8 +1379,24 @@ cdef cummin_max(groupby_t[:, ::1] out,
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
groupby_t val, mval, na_val
uint8_t[:, ::1] seen_na
intp_t lab
bint na_possible

if groupby_t is float64_t or groupby_t is float32_t:
na_val = NaN
na_possible = True
elif is_datetimelike:
na_val = NPY_NAT
na_possible = True
# Will never be used, just to avoid uninitialized warning
else:
na_val = 0
na_possible = False

if na_possible:
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)

N, K = (<object>values).shape
with nogil:
Expand All @@ -1385,18 +1405,22 @@ cdef cummin_max(groupby_t[:, ::1] out,
if lab < 0:
continue
for j in range(K):
val = values[i, j]
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
if not skipna and na_possible and seen_na[lab, j]:
out[i, j] = na_val
else:
out[i, j] = val
val = values[i, j]
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
seen_na[lab, j] = 1
out[i, j] = val


@cython.boundscheck(False)
Expand All @@ -1406,6 +1430,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
Expand All @@ -1414,25 +1439,32 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
uint8_t[:, ::1] seen_na
intp_t lab

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue
for j in range(K):
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
if not skipna and seen_na[lab, j]:
mask[i, j] = 1
else:
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
seen_na[lab, j] = 1


@cython.boundscheck(False)
Expand All @@ -1442,7 +1474,8 @@ def group_cummin(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1451,6 +1484,7 @@ def group_cummin(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=False
)

Expand All @@ -1462,7 +1496,8 @@ def group_cummax(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1471,5 +1506,6 @@ def group_cummax(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=True
)
6 changes: 4 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2784,10 +2784,11 @@ def cummin(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can this go in the signature explicitly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some context in #41854 (comment).

Happy to make that change here; I was planning on doing it in a follow-up to make `skipna` and `numeric_only` consistent across `cumsum` and `cummin`/`cummax`.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree we want this in the signature; doing it as a follow-up is OK.

if axis != 0:
return self.apply(lambda x: np.minimum.accumulate(x, axis))

return self._cython_transform("cummin", numeric_only=False)
return self._cython_transform("cummin", numeric_only=False, skipna=skipna)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally cummin/max could handle kwargs the same way as cumsum does, but the problem is the forced numeric_only=False.

As a followup change, it would be nice to remove this inconsistency (especially since numeric_only=True would just be ignored). Would this require a deprecation cycle, or could current behavior of ignoring numeric_only=True be considered a bug?

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or could current behavior of ignoring numeric_only=True be considered a bug?

I think so, yes.

Making this change could enable actually exposing skipna and numeric_only in the signatures of these cumulative functions, instead of having docs that just show *args, **kwargs

Yeah, it looks like we currently silently ignore args/kwargs, which isn't great.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, sounds good — I will plan to leave this as a follow-on, since it should be an independent improvement from these changes.


@final
@Substitution(name="groupby")
Expand All @@ -2800,10 +2801,11 @@ def cummax(self, axis=0, **kwargs):
-------
Series or DataFrame
"""
skipna = kwargs.get("skipna", True)
if axis != 0:
return self.apply(lambda x: np.maximum.accumulate(x, axis))

return self._cython_transform("cummax", numeric_only=False)
return self._cython_transform("cummax", numeric_only=False, skipna=skipna)

@final
def _get_cythonized_result(
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,39 @@ def test_cummax(dtypes_for_minmax):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
"groups,expected_data",
[
([1, 1, 1], [1, None, None]),
([1, 2, 3], [1, None, 2]),
([1, 3, 3], [1, None, None]),
],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
gb = df.groupby(groups)["a"]

result = getattr(gb, method)(skipna=False)
expected = Series(expected_data, dtype=dtype, name="a")

tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
# Ensure missing value in "a" doesn't cause "b" to be nan-filled
df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
gb = df.groupby([1, 1, 1])[["a", "b"]]

result = getattr(gb, method)(skipna=False)
expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})

tm.assert_frame_equal(result, expected)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
Expand Down