Skip to content

Commit 0fe491d

Browse files
mroeschke authored and jreback committed
PERF: Cythonize Groupby.cummin/cummax (#15048)
closes #15048 Author: Matt Roeschke <[email protected]> Closes #15053 from mroeschke/improve_cummin_cummax and squashes the following commits: 5e8ba63 [Matt Roeschke] PERF: Cythonize Groupby.cummin/cummax (#15048)
1 parent b8e7c34 commit 0fe491d

File tree

5 files changed

+173
-14
lines changed

5 files changed

+173
-14
lines changed

asv_bench/benchmarks/groupby.py

-3
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,3 @@ def time_shift(self):
690690
def time_transform_dataframe(self):
691691
# GH 12737
692692
self.df_nans.groupby('key').transform('first')
693-
694-
695-

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ Performance Improvements
283283
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
284284
- Improved performance of timeseries plotting with an irregular DatetimeIndex
285285
(or with ``compat_x=True``) (:issue:`15073`).
286-
286+
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
287287

288288
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
289289

pandas/core/groupby.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
'last', 'first',
7676
'head', 'tail', 'median',
7777
'mean', 'sum', 'min', 'max',
78-
'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
78+
'cumcount',
7979
'resample',
8080
'describe',
8181
'rank', 'quantile',
@@ -97,7 +97,8 @@
9797
_dataframe_apply_whitelist = \
9898
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
9999

100-
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
100+
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
101+
'cummin', 'cummax'])
101102

102103

103104
def _groupby_function(name, alias, npfunc, numeric_only=True,
@@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs):
14151416

14161417
return self._cython_transform('cumsum')
14171418

1419+
@Substitution(name='groupby')
@Appender(_doc_template)
def cummin(self, axis=0):
    """Cumulative min for each group"""
    # Fast path: the default axis goes through the Cython transform.
    if axis == 0:
        return self._cython_transform('cummin')

    # Any other axis: accumulate within each group through numpy.
    return self.apply(lambda grp: np.minimum.accumulate(grp, axis))
1427+
1428+
@Substitution(name='groupby')
@Appender(_doc_template)
def cummax(self, axis=0):
    """Cumulative max for each group"""
    # Fast path: the default axis goes through the Cython transform.
    if axis == 0:
        return self._cython_transform('cummax')

    # Any other axis: accumulate within each group through numpy.
    return self.apply(lambda grp: np.maximum.accumulate(grp, axis))
1436+
14181437
@Substitution(name='groupby')
14191438
@Appender(_doc_template)
14201439
def shift(self, periods=1, freq=None, axis=0):
@@ -1752,6 +1771,8 @@ def get_group_levels(self):
17521771
'transform': {
17531772
'cumprod': 'group_cumprod',
17541773
'cumsum': 'group_cumsum',
1774+
'cummin': 'group_cummin',
1775+
'cummax': 'group_cummax',
17551776
}
17561777
}
17571778

pandas/src/algos_groupby_helper.pxi.in

+70
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,76 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
568568
else:
569569
out[i, j] = minx[i, j]
570570

571+
572+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                          ndarray[{{dest_type2}}, ndim=2] values,
                          ndarray[int64_t] labels,
                          ndarray[{{dest_type2}}, ndim=2] accum):
    """
    Running minimum of each column of `values`, tracked separately per
    group label.

    Only transforms on axis=0

    Parameters
    ----------
    out : 2-d array
        Written in place; receives one result per element of `values`.
    values : 2-d array
        Input, scanned row by row.
    labels : 1-d int64 array
        Group label of each row of `values`. Rows with a negative label
        are skipped, so their entries in `out` are left untouched.
    accum : 2-d array
        Scratch buffer indexed by [label, column]. It is overwritten here
        with the template's infinity value and then holds the running
        minimum seen so far for each group.
    """
    cdef:
        Py_ssize_t i, j, N, K
        {{dest_type2}} val
        int64_t lab

    N, K = (<object> values).shape
    accum.fill({{inf_val}})

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                # `val == val` is False only for NaN.
                # NOTE(review): if this template is also instantiated for
                # integer types, the else branch below can never run --
                # confirm against the template's dtype list.
                if val == val:
                    if val < accum[lab, j]:
                        accum[lab, j] = val
                    out[i, j] = accum[lab, j]
                # val = nan
                else:
                    out[i, j] = {{nan_val}}
605+
606+
607+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                          ndarray[{{dest_type2}}, ndim=2] values,
                          ndarray[int64_t] labels,
                          ndarray[{{dest_type2}}, ndim=2] accum):
    """
    Running maximum of each column of `values`, tracked separately per
    group label.

    Only transforms on axis=0

    Parameters
    ----------
    out : 2-d array
        Written in place; receives one result per element of `values`.
    values : 2-d array
        Input, scanned row by row.
    labels : 1-d int64 array
        Group label of each row of `values`. Rows with a negative label
        are skipped, so their entries in `out` are left untouched.
    accum : 2-d array
        Scratch buffer indexed by [label, column]. It is overwritten here
        with the negated template infinity value and then holds the
        running maximum seen so far for each group.
    """
    cdef:
        Py_ssize_t i, j, N, K
        {{dest_type2}} val
        int64_t lab

    N, K = (<object> values).shape
    # NOTE(review): if the template's infinity value is the largest
    # representable integer for integer instantiations, its negation is
    # one above the type minimum, so a group holding only the type minimum
    # would report the wrong maximum -- TODO confirm.
    accum.fill(-{{inf_val}})

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                # `val == val` is False only for NaN.
                # NOTE(review): if this template is also instantiated for
                # integer types, the else branch below can never run --
                # confirm against the template's dtype list.
                if val == val:
                    if val > accum[lab, j]:
                        accum[lab, j] = val
                    out[i, j] = accum[lab, j]
                # val = nan
                else:
                    out[i, j] = {{nan_val}}
640+
571641
{{endfor}}
572642

573643
#----------------------------------------------------------------------

pandas/tests/groupby/test_groupby.py

+79-8
Original file line numberDiff line numberDiff line change
@@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self):
49774977
'max',
49784978
'head',
49794979
'tail',
4980-
'cumsum',
4981-
'cumprod',
4982-
'cummin',
4983-
'cummax',
49844980
'cumcount',
49854981
'resample',
49864982
'describe',
@@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self):
50185014
'max',
50195015
'head',
50205016
'tail',
5021-
'cumsum',
5022-
'cumprod',
5023-
'cummin',
5024-
'cummax',
50255017
'cumcount',
50265018
'resample',
50275019
'describe',
@@ -5777,6 +5769,85 @@ def test_agg_over_numpy_arrays(self):
57775769

57785770
assert_frame_equal(result, expected)
57795771

5772+
def test_cummin_cummax(self):
    # GH 15048
    # Each groupby().cummin()/cummax() result is checked twice: against
    # hand-written expected values and against the per-group apply path.

    def via_apply(frame, op):
        # Reference result: call the Series method `op` group by group.
        return (frame.groupby('A')
                .B
                .apply(lambda grp: getattr(grp, op)())
                .to_frame())

    int_types = [np.int32, np.int64]
    float_types = [np.float32, np.float64]
    # (dtype, smallest value, largest value) for every dtype under test
    dtype_bounds = (
        [(t, np.iinfo(t).min, np.iinfo(t).max) for t in int_types] +
        [(t, np.finfo(t).min, np.finfo(t).max) for t in float_types])

    base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
                            'B': [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    for dtype, min_val, max_val in dtype_bounds:
        df = base_df.astype(dtype)

        # cummin
        expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
        result = df.groupby('A').cummin()
        tm.assert_frame_equal(result, expected)
        result = via_apply(df, 'cummin')
        tm.assert_frame_equal(result, expected)

        # Test cummin w/ min value for dtype
        df.loc[[2, 6], 'B'] = min_val
        expected.loc[[2, 3, 6, 7], 'B'] = min_val
        result = df.groupby('A').cummin()
        tm.assert_frame_equal(result, expected)
        expected = via_apply(df, 'cummin')
        tm.assert_frame_equal(result, expected)

        # cummax
        expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
        result = df.groupby('A').cummax()
        tm.assert_frame_equal(result, expected)
        result = via_apply(df, 'cummax')
        tm.assert_frame_equal(result, expected)

        # Test cummax w/ max value for dtype
        df.loc[[2, 6], 'B'] = max_val
        expected.loc[[2, 3, 6, 7], 'B'] = max_val
        result = df.groupby('A').cummax()
        tm.assert_frame_equal(result, expected)
        expected = via_apply(df, 'cummax')
        tm.assert_frame_equal(result, expected)

    # Test nan in some values
    nan = np.nan
    base_df.loc[[0, 2, 4, 6], 'B'] = nan

    expected = pd.DataFrame({'B': [nan, 4, nan, 2, nan, 3, nan, 1]})
    result = base_df.groupby('A').cummin()
    tm.assert_frame_equal(result, expected)
    expected = via_apply(base_df, 'cummin')
    tm.assert_frame_equal(result, expected)

    expected = pd.DataFrame({'B': [nan, 4, nan, 4, nan, 3, nan, 3]})
    result = base_df.groupby('A').cummax()
    tm.assert_frame_equal(result, expected)
    expected = via_apply(base_df, 'cummax')
    tm.assert_frame_equal(result, expected)

    # Test nan in entire column
    base_df['B'] = nan
    expected = pd.DataFrame({'B': [nan] * 8})
    result = base_df.groupby('A').cummin()
    tm.assert_frame_equal(expected, result)
    result = via_apply(base_df, 'cummin')
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby('A').cummax()
    tm.assert_frame_equal(expected, result)
    result = via_apply(base_df, 'cummax')
    tm.assert_frame_equal(expected, result)
5850+
57805851

57815852
def assert_fp_equal(a, b):
57825853
assert (np.abs(a - b) < 1e-12).all()

0 commit comments

Comments
 (0)