Skip to content

Commit 2fc025b

Browse files
committed
PERF: Cythonize Groupby.cummin/cummax (#15048)
Squashed changes: PEP 8 cleanup; removed args/kwargs; added whatsnew entry and test; changed logic; fixed a small error in the algorithm; used a more obvious test; fixed the algorithm so the test passes; added dtypes test; added additional tests; handled the NaN case; adapted max/min for different dtypes and added tests; removed unnecessary comments.
1 parent b895968 commit 2fc025b

File tree

5 files changed

+171
-14
lines changed

5 files changed

+171
-14
lines changed

asv_bench/benchmarks/groupby.py

-3
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,3 @@ def time_shift(self):
690690
def time_transform_dataframe(self):
691691
# GH 12737
692692
self.df_nans.groupby('key').transform('first')
693-
694-
695-

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ Performance Improvements
281281

282282
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
283283
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
284-
284+
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
285285

286286

287287
.. _whatsnew_0200.bug_fixes:

pandas/core/groupby.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
'last', 'first',
7676
'head', 'tail', 'median',
7777
'mean', 'sum', 'min', 'max',
78-
'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
78+
'cumcount',
7979
'resample',
8080
'describe',
8181
'rank', 'quantile',
@@ -97,7 +97,8 @@
9797
_dataframe_apply_whitelist = \
9898
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
9999

100-
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
100+
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
101+
'cummin', 'cummax'])
101102

102103

103104
def _groupby_function(name, alias, npfunc, numeric_only=True,
@@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs):
14151416

14161417
return self._cython_transform('cumsum')
14171418

1419+
@Substitution(name='groupby')
1420+
@Appender(_doc_template)
1421+
def cummin(self, axis=0):
1422+
"""Cumulative min for each group"""
1423+
if axis != 0:
1424+
return self.apply(lambda x: np.minimum.accumulate(x, axis))
1425+
1426+
return self._cython_transform('cummin')
1427+
1428+
@Substitution(name='groupby')
1429+
@Appender(_doc_template)
1430+
def cummax(self, axis=0):
1431+
"""Cumulative max for each group"""
1432+
if axis != 0:
1433+
return self.apply(lambda x: np.maximum.accumulate(x, axis))
1434+
1435+
return self._cython_transform('cummax')
1436+
14181437
@Substitution(name='groupby')
14191438
@Appender(_doc_template)
14201439
def shift(self, periods=1, freq=None, axis=0):
@@ -1752,6 +1771,8 @@ def get_group_levels(self):
17521771
'transform': {
17531772
'cumprod': 'group_cumprod',
17541773
'cumsum': 'group_cumsum',
1774+
'cummin': 'group_cummin',
1775+
'cummax': 'group_cummax',
17551776
}
17561777
}
17571778

pandas/src/algos_groupby_helper.pxi.in

+72
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,78 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
568568
else:
569569
out[i, j] = minx[i, j]
570570

571+
572+
@cython.boundscheck(False)
573+
@cython.wraparound(False)
574+
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
575+
ndarray[{{dest_type2}}, ndim=2] values,
576+
ndarray[int64_t] labels,
577+
ndarray[{{dest_type2}}, ndim=2] accum):
578+
"""
579+
Only transforms on axis=0
580+
"""
581+
cdef:
582+
Py_ssize_t i, j, N, K, size
583+
{{dest_type2}} val, min_val
584+
int64_t lab
585+
586+
N, K = (<object> values).shape
587+
accum = np.empty_like(accum)
588+
accum.fill({{inf_val}})
589+
590+
with nogil:
591+
for i in range(N):
592+
lab = labels[i]
593+
594+
if lab < 0:
595+
continue
596+
for j in range(K):
597+
val = values[i, j]
598+
if val == val:
599+
if val < accum[lab, j]:
600+
min_val = val
601+
accum[lab, j] = min_val
602+
out[i, j] = accum[lab, j]
603+
# val = nan
604+
elif val != val:
605+
out[i, j] = {{nan_val}}
606+
607+
608+
@cython.boundscheck(False)
609+
@cython.wraparound(False)
610+
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
611+
ndarray[{{dest_type2}}, ndim=2] values,
612+
ndarray[int64_t] labels,
613+
ndarray[{{dest_type2}}, ndim=2] accum):
614+
"""
615+
Only transforms on axis=0
616+
"""
617+
cdef:
618+
Py_ssize_t i, j, N, K, size
619+
{{dest_type2}} val, max_val
620+
int64_t lab
621+
622+
N, K = (<object> values).shape
623+
accum = np.empty_like(accum)
624+
accum.fill(-{{inf_val}})
625+
626+
with nogil:
627+
for i in range(N):
628+
lab = labels[i]
629+
630+
if lab < 0:
631+
continue
632+
for j in range(K):
633+
val = values[i, j]
634+
if val == val:
635+
if val > accum[lab, j]:
636+
max_val = val
637+
accum[lab, j] = max_val
638+
out[i, j] = accum[lab, j]
639+
# val = nan
640+
elif val != val:
641+
out[i, j] = {{nan_val}}
642+
571643
{{endfor}}
572644

573645
#----------------------------------------------------------------------

pandas/tests/groupby/test_groupby.py

+75-8
Original file line numberDiff line numberDiff line change
@@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self):
49774977
'max',
49784978
'head',
49794979
'tail',
4980-
'cumsum',
4981-
'cumprod',
4982-
'cummin',
4983-
'cummax',
49844980
'cumcount',
49854981
'resample',
49864982
'describe',
@@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self):
50185014
'max',
50195015
'head',
50205016
'tail',
5021-
'cumsum',
5022-
'cumprod',
5023-
'cummin',
5024-
'cummax',
50255017
'cumcount',
50265018
'resample',
50275019
'describe',
@@ -5777,6 +5769,81 @@ def test_agg_over_numpy_arrays(self):
57775769

57785770
assert_frame_equal(result, expected)
57795771

5772+
def test_cummin_cummax(self):
5773+
# GH 15048
5774+
# Test with different dtypes
5775+
for dtype in [np.int32, np.float32, np.float64]:
5776+
df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
5777+
'B': range(3, 9)}).astype(dtype)
5778+
result = df.groupby('A').cummin()
5779+
expected = pd.DataFrame({'B': [3, 3, 3, 6, 6, 6]}).astype(dtype)
5780+
tm.assert_frame_equal(result, expected)
5781+
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5782+
tm.assert_frame_equal(result, expected)
5783+
# Test min value for dtype
5784+
try:
5785+
min_val = np.iinfo(dtype).min
5786+
except ValueError:
5787+
min_val = np.finfo(dtype).min
5788+
df.loc[[1, 4], 'B'] = min_val
5789+
result = df.groupby('A').cummin()
5790+
expected = pd.DataFrame({'B': [3, min_val, min_val,
5791+
6, min_val, min_val]}).astype(dtype)
5792+
tm.assert_frame_equal(result, expected)
5793+
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5794+
tm.assert_frame_equal(result, expected)
5795+
5796+
df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
5797+
'B': range(8, 2, -1)}).astype(dtype)
5798+
result = df.groupby('A').cummax()
5799+
expected = pd.DataFrame({'B': [8, 8, 8, 5, 5, 5]}).astype(dtype)
5800+
tm.assert_frame_equal(result, expected)
5801+
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5802+
tm.assert_frame_equal(result, expected)
5803+
# Test max value for dtype
5804+
try:
5805+
max_val = np.iinfo(dtype).max
5806+
except ValueError:
5807+
max_val = np.finfo(dtype).max
5808+
df.loc[[1, 4], 'B'] = max_val
5809+
result = df.groupby('A').cummax()
5810+
expected = pd.DataFrame({'B': [8, max_val, max_val,
5811+
5, max_val, max_val]}).astype(dtype)
5812+
tm.assert_frame_equal(result, expected)
5813+
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5814+
tm.assert_frame_equal(result, expected)
5815+
5816+
# Some entries in column are nan
5817+
df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
5818+
'B': range(3, 9)})
5819+
df.loc[[1, 4], 'B'] = np.nan
5820+
result = df.groupby('A').cummin()
5821+
expected = pd.DataFrame({'B': [3, np.nan, 3, 6, np.nan, 6]})
5822+
tm.assert_frame_equal(result, expected)
5823+
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5824+
tm.assert_frame_equal(result, expected)
5825+
5826+
df = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2],
5827+
'B': range(8, 2, -1)})
5828+
df.loc[[1, 4], 'B'] = np.nan
5829+
result = df.groupby('A').cummax()
5830+
expected = pd.DataFrame({'B': [8, np.nan, 8, 5, np.nan, 5]})
5831+
tm.assert_frame_equal(result, expected)
5832+
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5833+
tm.assert_frame_equal(result, expected)
5834+
5835+
# Entire column is nan
5836+
df['B'] = np.nan
5837+
expected = pd.DataFrame({'B': [np.nan] * 6})
5838+
result = df.groupby('A').cummin()
5839+
tm.assert_frame_equal(expected, result)
5840+
result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
5841+
tm.assert_frame_equal(expected, result)
5842+
result = df.groupby('A').cummax()
5843+
tm.assert_frame_equal(expected, result)
5844+
result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
5845+
tm.assert_frame_equal(expected, result)
5846+
57805847

57815848
def assert_fp_equal(a, b):
57825849
assert (np.abs(a - b) < 1e-12).all()

0 commit comments

Comments
 (0)