Skip to content

Commit d0eba0a

Browse files
authored
BUG/ENH: group cummin/max handle skipna (pandas-dev#41854)
1 parent d4eb667 commit d0eba0a

File tree

4 files changed

+99
-27
lines changed

4 files changed

+99
-27
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Other enhancements
3737
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
3838
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
3939
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
40+
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
4041
-
4142

4243
.. ---------------------------------------------------------------------------

pandas/_libs/groupby.pyx

+61-25
Original file line numberDiff line numberDiff line change
@@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
13171317
const intp_t[:] labels,
13181318
int ngroups,
13191319
bint is_datetimelike,
1320+
bint skipna,
13201321
bint compute_max):
13211322
"""
13221323
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1336,6 +1337,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
13361337
Number of groups, larger than all entries of `labels`.
13371338
is_datetimelike : bool
13381339
True if `values` contains datetime-like entries.
1340+
skipna : bool
1341+
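If True, ignore nans in `values`. If False, once a missing value is seen in a group (tracked per column), every later entry of that group is output as missing.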
13391342
compute_max : bool
13401343
True if cumulative maximum should be computed, False
13411344
if cumulative minimum should be computed
@@ -1356,9 +1359,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
13561359
accum[:] = -np.inf if compute_max else np.inf
13571360

13581361
if mask is not None:
1359-
masked_cummin_max(out, values, mask, labels, accum, compute_max)
1362+
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max)
13601363
else:
1361-
cummin_max(out, values, labels, accum, is_datetimelike, compute_max)
1364+
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max)
13621365

13631366

13641367
@cython.boundscheck(False)
@@ -1367,6 +1370,7 @@ cdef cummin_max(groupby_t[:, ::1] out,
13671370
ndarray[groupby_t, ndim=2] values,
13681371
const intp_t[:] labels,
13691372
groupby_t[:, ::1] accum,
1373+
bint skipna,
13701374
bint is_datetimelike,
13711375
bint compute_max):
13721376
"""
@@ -1375,8 +1379,24 @@ cdef cummin_max(groupby_t[:, ::1] out,
13751379
"""
13761380
cdef:
13771381
Py_ssize_t i, j, N, K
1378-
groupby_t val, mval
1382+
groupby_t val, mval, na_val
1383+
uint8_t[:, ::1] seen_na
13791384
intp_t lab
1385+
bint na_possible
1386+
1387+
if groupby_t is float64_t or groupby_t is float32_t:
1388+
na_val = NaN
1389+
na_possible = True
1390+
elif is_datetimelike:
1391+
na_val = NPY_NAT
1392+
na_possible = True
1393+
# Will never be used, just to avoid uninitialized warning
1394+
else:
1395+
na_val = 0
1396+
na_possible = False
1397+
1398+
if na_possible:
1399+
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
13801400

13811401
N, K = (<object>values).shape
13821402
with nogil:
@@ -1385,18 +1405,22 @@ cdef cummin_max(groupby_t[:, ::1] out,
13851405
if lab < 0:
13861406
continue
13871407
for j in range(K):
1388-
val = values[i, j]
1389-
if not _treat_as_na(val, is_datetimelike):
1390-
mval = accum[lab, j]
1391-
if compute_max:
1392-
if val > mval:
1393-
accum[lab, j] = mval = val
1394-
else:
1395-
if val < mval:
1396-
accum[lab, j] = mval = val
1397-
out[i, j] = mval
1408+
if not skipna and na_possible and seen_na[lab, j]:
1409+
out[i, j] = na_val
13981410
else:
1399-
out[i, j] = val
1411+
val = values[i, j]
1412+
if not _treat_as_na(val, is_datetimelike):
1413+
mval = accum[lab, j]
1414+
if compute_max:
1415+
if val > mval:
1416+
accum[lab, j] = mval = val
1417+
else:
1418+
if val < mval:
1419+
accum[lab, j] = mval = val
1420+
out[i, j] = mval
1421+
else:
1422+
seen_na[lab, j] = 1
1423+
out[i, j] = val
14001424

14011425

14021426
@cython.boundscheck(False)
@@ -1406,6 +1430,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
14061430
uint8_t[:, ::1] mask,
14071431
const intp_t[:] labels,
14081432
groupby_t[:, ::1] accum,
1433+
bint skipna,
14091434
bint compute_max):
14101435
"""
14111436
Compute the cumulative minimum/maximum of columns of `values`, in row groups
@@ -1414,25 +1439,32 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
14141439
cdef:
14151440
Py_ssize_t i, j, N, K
14161441
groupby_t val, mval
1442+
uint8_t[:, ::1] seen_na
14171443
intp_t lab
14181444

14191445
N, K = (<object>values).shape
1446+
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
14201447
with nogil:
14211448
for i in range(N):
14221449
lab = labels[i]
14231450
if lab < 0:
14241451
continue
14251452
for j in range(K):
1426-
if not mask[i, j]:
1427-
val = values[i, j]
1428-
mval = accum[lab, j]
1429-
if compute_max:
1430-
if val > mval:
1431-
accum[lab, j] = mval = val
1453+
if not skipna and seen_na[lab, j]:
1454+
mask[i, j] = 1
1455+
else:
1456+
if not mask[i, j]:
1457+
val = values[i, j]
1458+
mval = accum[lab, j]
1459+
if compute_max:
1460+
if val > mval:
1461+
accum[lab, j] = mval = val
1462+
else:
1463+
if val < mval:
1464+
accum[lab, j] = mval = val
1465+
out[i, j] = mval
14321466
else:
1433-
if val < mval:
1434-
accum[lab, j] = mval = val
1435-
out[i, j] = mval
1467+
seen_na[lab, j] = 1
14361468

14371469

14381470
@cython.boundscheck(False)
@@ -1442,7 +1474,8 @@ def group_cummin(groupby_t[:, ::1] out,
14421474
const intp_t[:] labels,
14431475
int ngroups,
14441476
bint is_datetimelike,
1445-
uint8_t[:, ::1] mask=None) -> None:
1477+
uint8_t[:, ::1] mask=None,
1478+
bint skipna=True) -> None:
14461479
"""See group_cummin_max.__doc__"""
14471480
group_cummin_max(
14481481
out,
@@ -1451,6 +1484,7 @@ def group_cummin(groupby_t[:, ::1] out,
14511484
labels,
14521485
ngroups,
14531486
is_datetimelike,
1487+
skipna,
14541488
compute_max=False
14551489
)
14561490

@@ -1462,7 +1496,8 @@ def group_cummax(groupby_t[:, ::1] out,
14621496
const intp_t[:] labels,
14631497
int ngroups,
14641498
bint is_datetimelike,
1465-
uint8_t[:, ::1] mask=None) -> None:
1499+
uint8_t[:, ::1] mask=None,
1500+
bint skipna=True) -> None:
14661501
"""See group_cummin_max.__doc__"""
14671502
group_cummin_max(
14681503
out,
@@ -1471,5 +1506,6 @@ def group_cummax(groupby_t[:, ::1] out,
14711506
labels,
14721507
ngroups,
14731508
is_datetimelike,
1509+
skipna,
14741510
compute_max=True
14751511
)

pandas/core/groupby/groupby.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -2784,10 +2784,11 @@ def cummin(self, axis=0, **kwargs):
27842784
-------
27852785
Series or DataFrame
27862786
"""
2787+
skipna = kwargs.get("skipna", True)
27872788
if axis != 0:
27882789
return self.apply(lambda x: np.minimum.accumulate(x, axis))
27892790

2790-
return self._cython_transform("cummin", numeric_only=False)
2791+
return self._cython_transform("cummin", numeric_only=False, skipna=skipna)
27912792

27922793
@final
27932794
@Substitution(name="groupby")
@@ -2800,10 +2801,11 @@ def cummax(self, axis=0, **kwargs):
28002801
-------
28012802
Series or DataFrame
28022803
"""
2804+
skipna = kwargs.get("skipna", True)
28032805
if axis != 0:
28042806
return self.apply(lambda x: np.maximum.accumulate(x, axis))
28052807

2806-
return self._cython_transform("cummax", numeric_only=False)
2808+
return self._cython_transform("cummax", numeric_only=False, skipna=skipna)
28072809

28082810
@final
28092811
def _get_cythonized_result(

pandas/tests/groupby/test_function.py

+33
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,39 @@ def test_cummax(dtypes_for_minmax):
803803
tm.assert_series_equal(result, expected)
804804

805805

806+
@pytest.mark.parametrize("method", ["cummin", "cummax"])
807+
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
808+
@pytest.mark.parametrize(
809+
"groups,expected_data",
810+
[
811+
([1, 1, 1], [1, None, None]),
812+
([1, 2, 3], [1, None, 2]),
813+
([1, 3, 3], [1, None, None]),
814+
],
815+
)
816+
def test_cummin_max_skipna(method, dtype, groups, expected_data):
817+
# GH-34047
818+
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
819+
gb = df.groupby(groups)["a"]
820+
821+
result = getattr(gb, method)(skipna=False)
822+
expected = Series(expected_data, dtype=dtype, name="a")
823+
824+
tm.assert_series_equal(result, expected)
825+
826+
827+
@pytest.mark.parametrize("method", ["cummin", "cummax"])
828+
def test_cummin_max_skipna_multiple_cols(method):
829+
# Ensure missing value in "a" doesn't cause "b" to be nan-filled
830+
df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
831+
gb = df.groupby([1, 1, 1])[["a", "b"]]
832+
833+
result = getattr(gb, method)(skipna=False)
834+
expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})
835+
836+
tm.assert_frame_equal(result, expected)
837+
838+
806839
@td.skip_if_32bit
807840
@pytest.mark.parametrize("method", ["cummin", "cummax"])
808841
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)