Skip to content

Commit f0d1b24

Browse files
committed
PERF: Cythonize Groupby.cummin/cummax (#15048)
Squashed changes: pep8 fixes; removed args and kwargs; added whatsnew entry and test; changed logic; fixed a small error in the algorithm; used a more obvious test; fixed the algorithm so that the test passes.
1 parent b895968 commit f0d1b24

File tree

5 files changed

+103
-14
lines changed

5 files changed

+103
-14
lines changed

asv_bench/benchmarks/groupby.py

-3
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,3 @@ def time_shift(self):
690690
def time_transform_dataframe(self):
691691
# GH 12737
692692
self.df_nans.groupby('key').transform('first')
693-
694-
695-

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ Performance Improvements
281281

282282
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
283283
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
284-
284+
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
285285

286286

287287
.. _whatsnew_0200.bug_fixes:

pandas/core/groupby.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
'last', 'first',
7676
'head', 'tail', 'median',
7777
'mean', 'sum', 'min', 'max',
78-
'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
78+
'cumcount',
7979
'resample',
8080
'describe',
8181
'rank', 'quantile',
@@ -97,7 +97,8 @@
9797
_dataframe_apply_whitelist = \
9898
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
9999

100-
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
100+
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
101+
'cummin', 'cummax'])
101102

102103

103104
def _groupby_function(name, alias, npfunc, numeric_only=True,
@@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs):
14151416

14161417
return self._cython_transform('cumsum')
14171418

1419+
@Substitution(name='groupby')
1420+
@Appender(_doc_template)
1421+
def cummin(self, axis=0):
    """Cumulative min for each group"""
    # Fast path: axis 0 is dispatched to the 'cummin' cython transform.
    if axis == 0:
        return self._cython_transform('cummin')

    # Any other axis: compute numpy's running minimum on each group
    # through the generic ``apply`` machinery.
    def _running_min(group):
        return np.minimum.accumulate(group, axis)

    return self.apply(_running_min)
1427+
1428+
@Substitution(name='groupby')
1429+
@Appender(_doc_template)
1430+
def cummax(self, axis=0):
    """Cumulative max for each group"""
    # Fast path: axis 0 is dispatched to the 'cummax' cython transform.
    if axis == 0:
        return self._cython_transform('cummax')

    # Any other axis: compute numpy's running maximum on each group
    # through the generic ``apply`` machinery.
    def _running_max(group):
        return np.maximum.accumulate(group, axis)

    return self.apply(_running_max)
1436+
14181437
@Substitution(name='groupby')
14191438
@Appender(_doc_template)
14201439
def shift(self, periods=1, freq=None, axis=0):
@@ -1752,6 +1771,8 @@ def get_group_levels(self):
17521771
'transform': {
17531772
'cumprod': 'group_cumprod',
17541773
'cumsum': 'group_cumsum',
1774+
'cummin': 'group_cummin',
1775+
'cummax': 'group_cummax',
17551776
}
17561777
}
17571778

pandas/src/algos_groupby_helper.pxi.in

+66
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,72 @@ def group_cumsum(numeric[:, :] out,
667667
out[i, j] = accum[lab, j]
668668

669669

670+
@cython.boundscheck(False)
671+
@cython.wraparound(False)
672+
def group_cummin(numeric[:, :] out,
                 numeric[:, :] values,
                 int64_t[:] labels,
                 numeric[:, :] accum):
    """
    Cumulative minimum of ``values`` within each group.

    Only transforms on axis=0

    Parameters
    ----------
    out : 2-D array
        Written in place. ``out[i, j]`` receives the running minimum of
        column ``j`` over the rows seen so far that share row ``i``'s label.
        Cells whose value is NaN, and rows with a negative label, are
        left untouched.
    values : 2-D array
        Input data, one row per observation.
    labels : 1-D int64 array
        ``labels[i]`` is the group of row ``i``; negative labels are skipped.
    accum : 2-D array
        Used only as a shape/dtype template for the per-group scratch
        buffer, which is indexed as ``[label, column]``; the array passed
        in is not modified.
    """
    cdef:
        Py_ssize_t i, j, N, K
        numeric val
        int64_t lab

    N, K = (<object> values).shape
    # Fresh scratch buffer seeded with a large sentinel so that the first
    # valid value seen for a group always becomes its minimum.
    accum = np.empty_like(accum)
    accum[:] = _int64_max

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                # val != val only for NaN; such cells are skipped.
                if val == val:
                    # Compare and store ``val`` directly.  Routing the
                    # store through a separate temporary risks writing a
                    # stale value left over from an earlier row or group
                    # whenever ``val`` is not a new minimum.
                    if val < accum[lab, j]:
                        accum[lab, j] = val
                    # NOTE(review): the scraped source lost indentation;
                    # this write is assumed to belong to the non-NaN
                    # branch -- confirm against the repository.
                    out[i, j] = accum[lab, j]
701+
702+
703+
@cython.boundscheck(False)
704+
@cython.wraparound(False)
705+
def group_cummax(numeric[:, :] out,
                 numeric[:, :] values,
                 int64_t[:] labels,
                 numeric[:, :] accum):
    """
    Cumulative maximum of ``values`` within each group.

    Only transforms on axis=0

    Parameters
    ----------
    out : 2-D array
        Written in place. ``out[i, j]`` receives the running maximum of
        column ``j`` over the rows seen so far that share row ``i``'s label.
        Cells whose value is NaN, and rows with a negative label, are
        left untouched.
    values : 2-D array
        Input data, one row per observation.
    labels : 1-D int64 array
        ``labels[i]`` is the group of row ``i``; negative labels are skipped.
    accum : 2-D array
        Used only as a shape/dtype template for the per-group scratch
        buffer, which is indexed as ``[label, column]``; the array passed
        in is not modified.
    """
    cdef:
        Py_ssize_t i, j, N, K
        numeric val
        int64_t lab

    N, K = (<object> values).shape
    # Fresh scratch buffer seeded with a very negative sentinel so that the
    # first valid value seen for a group always becomes its maximum.
    accum = np.empty_like(accum)
    accum[:] = -_int64_max

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                # val != val only for NaN; such cells are skipped.
                if val == val:
                    # Compare and store ``val`` directly.  Routing the
                    # store through a separate temporary risks writing a
                    # stale value left over from an earlier row or group
                    # whenever ``val`` is not a new maximum.
                    if val > accum[lab, j]:
                        accum[lab, j] = val
                    # NOTE(review): the scraped source lost indentation;
                    # this write is assumed to belong to the non-NaN
                    # branch -- confirm against the repository.
                    out[i, j] = accum[lab, j]
734+
735+
670736
@cython.boundscheck(False)
671737
@cython.wraparound(False)
672738
def group_shift_indexer(int64_t[:] out, int64_t[:] labels,

pandas/tests/groupby/test_groupby.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self):
49774977
'max',
49784978
'head',
49794979
'tail',
4980-
'cumsum',
4981-
'cumprod',
4982-
'cummin',
4983-
'cummax',
49844980
'cumcount',
49854981
'resample',
49864982
'describe',
@@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self):
50185014
'max',
50195015
'head',
50205016
'tail',
5021-
'cumsum',
5022-
'cumprod',
5023-
'cummin',
5024-
'cummax',
50255017
'cumcount',
50265018
'resample',
50275019
'describe',
@@ -5777,6 +5769,19 @@ def test_agg_over_numpy_arrays(self):
57775769

57785770
assert_frame_equal(result, expected)
57795771

5772+
def test_cummin_cummax(self):
    # Both frames use two contiguous groups of three rows.  Column B is
    # monotonic, so each group's running extreme equals its first value.
    keys = [1, 1, 1, 2, 2, 2]

    # cummin: B rises from 3 to 8.
    rising = pd.DataFrame({'A': keys,
                           'B': range(3, 9)})
    expected_min = pd.DataFrame({'B': [3, 3, 3, 6, 6, 6]})
    tm.assert_frame_equal(rising.groupby('A').cummin(), expected_min)

    # cummax: B falls from 8 to 3.
    falling = pd.DataFrame({'A': keys,
                            'B': range(8, 2, -1)})
    expected_max = pd.DataFrame({'B': [8, 8, 8, 5, 5, 5]})
    tm.assert_frame_equal(falling.groupby('A').cummax(), expected_max)
5784+
57805785

57815786
def assert_fp_equal(a, b):
57825787
assert (np.abs(a - b) < 1e-12).all()

0 commit comments

Comments
 (0)