Skip to content

Commit e811514

Browse files
committed
PERF: Cythonize Groupby.cummin/cummax (#15048)
1 parent 6eb705f commit e811514

File tree

4 files changed

+112
-9
lines changed

4 files changed

+112
-9
lines changed

asv_bench/benchmarks/groupby.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,26 @@ def time_groupby_dt_timegrouper_size(self):
344344
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
345345

346346

347+
#----------------------------------------------------------------------
348+
# groupby.cummin and groupby.cummax # GH 15048
349+
350+
class groupby_cummax_cummin(object):
    """Timing benchmarks for ``groupby(...).cummin()`` / ``cummax()`` (GH 15048)."""

    goal_time = 0.2

    def setup(self):
        """Build a seeded two-column frame: integer keys 'A', float values 'B'."""
        # Fixed seed so every benchmark run sees the same data.
        np.random.seed(1234)
        self.G = 1000
        self.N = 10000
        # Draw the keys before the values: the order of the two random calls
        # determines the generated data.
        group_keys = np.random.randint(0, self.G, size=self.N)
        payload = np.random.randn(self.N)
        self.df = pd.DataFrame({'A': group_keys, 'B': payload})

    def time_cummin(self):
        """Time the grouped cumulative minimum."""
        grouped = self.df.groupby('A')
        grouped.cummin()

    def time_cummax(self):
        """Time the grouped cumulative maximum."""
        grouped = self.df.groupby('A')
        grouped.cummax()
365+
366+
347367
#----------------------------------------------------------------------
348368
# groupby with a variable value for ngroups
349369

@@ -690,6 +710,3 @@ def time_shift(self):
690710
def time_transform_dataframe(self):
691711
# GH 12737
692712
self.df_nans.groupby('key').transform('first')
693-
694-
695-

pandas/core/groupby.py

+25-2
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
'last', 'first',
7676
'head', 'tail', 'median',
7777
'mean', 'sum', 'min', 'max',
78-
'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
78+
'cumsum', 'cumprod', 'cumcount',
7979
'resample',
8080
'describe',
8181
'rank', 'quantile',
@@ -97,7 +97,8 @@
9797
_dataframe_apply_whitelist = \
9898
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
9999

100-
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
100+
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
101+
'cummin', 'cummax'])
101102

102103

103104
def _groupby_function(name, alias, npfunc, numeric_only=True,
@@ -1415,6 +1416,26 @@ def cumsum(self, axis=0, *args, **kwargs):
14151416

14161417
return self._cython_transform('cumsum')
14171418

1419+
@Substitution(name='groupby')
@Appender(_doc_template)
def cummin(self, axis=0, *args, **kwargs):
    """Cumulative min for each group"""
    # Reject unsupported numpy-style positional/keyword arguments up front.
    nv.validate_groupby_func('cummin', args, kwargs)

    if axis == 0:
        # Default axis: dispatch to the cython transform.
        return self._cython_transform('cummin')

    # Any other axis: run numpy's running minimum on each group via apply.
    return self.apply(lambda grp: np.minimum.accumulate(grp, axis))
1428+
1429+
@Substitution(name='groupby')
@Appender(_doc_template)
def cummax(self, axis=0, *args, **kwargs):
    """Cumulative max for each group"""
    # Reject unsupported numpy-style positional/keyword arguments up front.
    nv.validate_groupby_func('cummax', args, kwargs)

    if axis == 0:
        # Default axis: dispatch to the cython transform.
        return self._cython_transform('cummax')

    # Any other axis: run numpy's running maximum on each group via apply.
    return self.apply(lambda grp: np.maximum.accumulate(grp, axis))
1438+
14181439
@Substitution(name='groupby')
14191440
@Appender(_doc_template)
14201441
def shift(self, periods=1, freq=None, axis=0):
@@ -1752,6 +1773,8 @@ def get_group_levels(self):
17521773
'transform': {
17531774
'cumprod': 'group_cumprod',
17541775
'cumsum': 'group_cumsum',
1776+
'cummin': 'group_cummin',
1777+
'cummax': 'group_cummax',
17551778
}
17561779
}
17571780

pandas/src/algos_groupby_helper.pxi.in

+67
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,73 @@ def group_cumsum(numeric[:, :] out,
667667
out[i, j] = accum[lab, j]
668668

669669

670+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_cummin(numeric[:, :] out,
                 numeric[:, :] values,
                 int64_t[:] labels,
                 numeric[:, :] accum):
    """
    Per-group cumulative minimum of each column of `values`.

    Only transforms on axis=0

    Parameters
    ----------
    out : 2-D array, indexed like `values`
        Receives the running minimum. Entries belonging to rows with a
        negative label, or to NaN inputs, are left untouched.
    values : 2-D array
        Data to transform, one row per observation.
    labels : 1-D int64 array
        Group label of each row of `values`; rows with a negative label
        are skipped.
    accum : 2-D array, indexed by (label, column)
        Only its shape and dtype are used: a fresh scratch array is
        allocated locally, so the caller's array is not modified.
    """
    cdef:
        Py_ssize_t i, j, N, K
        numeric val
        int64_t lab

    N, K = (<object> values).shape

    # Seed every (group, column) slot with the largest value of the dtype so
    # that the first valid observation of a group always replaces it.
    # Seeding with zeros would clamp all-positive groups to 0.
    scratch = np.empty_like(accum)
    if scratch.dtype.kind == 'f':
        scratch.fill(np.inf)
    else:
        scratch.fill(np.iinfo(scratch.dtype).max)
    accum = scratch

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                # val != val only for NaN; NaNs do not update the running
                # minimum and produce no output.
                if val == val:
                    # Compare against this group's state for this column,
                    # not against other columns of the same row.
                    if val < accum[lab, j]:
                        accum[lab, j] = val
                    out[i, j] = accum[lab, j]
702+
703+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_cummax(numeric[:, :] out,
                 numeric[:, :] values,
                 int64_t[:] labels,
                 numeric[:, :] accum):
    """
    Per-group cumulative maximum of each column of `values`.

    Only transforms on axis=0

    Parameters
    ----------
    out : 2-D array, indexed like `values`
        Receives the running maximum. Entries belonging to rows with a
        negative label, or to NaN inputs, are left untouched.
    values : 2-D array
        Data to transform, one row per observation.
    labels : 1-D int64 array
        Group label of each row of `values`; rows with a negative label
        are skipped.
    accum : 2-D array, indexed by (label, column)
        Only its shape and dtype are used: a fresh scratch array is
        allocated locally, so the caller's array is not modified.
    """
    cdef:
        Py_ssize_t i, j, N, K
        numeric val
        int64_t lab

    N, K = (<object> values).shape

    # Seed every (group, column) slot with the smallest value of the dtype so
    # that the first valid observation of a group always replaces it.
    # Seeding with zeros would clamp all-negative groups to 0.
    scratch = np.empty_like(accum)
    if scratch.dtype.kind == 'f':
        scratch.fill(-np.inf)
    else:
        scratch.fill(np.iinfo(scratch.dtype).min)
    accum = scratch

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                # val != val only for NaN; NaNs do not update the running
                # maximum and produce no output.
                if val == val:
                    # Compare against this group's state for this column,
                    # not against other columns of the same row.
                    if val > accum[lab, j]:
                        accum[lab, j] = val
                    out[i, j] = accum[lab, j]
735+
736+
670737
@cython.boundscheck(False)
671738
@cython.wraparound(False)
672739
def group_shift_indexer(int64_t[:] out, int64_t[:] labels,

pandas/tests/groupby/test_groupby.py

-4
Original file line numberDiff line numberDiff line change
@@ -4979,8 +4979,6 @@ def test_groupby_whitelist(self):
49794979
'tail',
49804980
'cumsum',
49814981
'cumprod',
4982-
'cummin',
4983-
'cummax',
49844982
'cumcount',
49854983
'resample',
49864984
'describe',
@@ -5020,8 +5018,6 @@ def test_groupby_whitelist(self):
50205018
'tail',
50215019
'cumsum',
50225020
'cumprod',
5023-
'cummin',
5024-
'cummax',
50255021
'cumcount',
50265022
'resample',
50275023
'describe',

0 commit comments

Comments
 (0)