Skip to content

PERF: Cythonize Groupby.cummin/cummax (#15048) #15053

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,3 @@ def time_shift(self):
def time_transform_dataframe(self):
    # GH 12737 -- benchmark groupby('key').transform('first').
    # NOTE(review): self.df_nans is a fixture defined elsewhere in the
    # benchmark class; presumably it contains NaNs -- confirm in setup().
    self.df_nans.groupby('key').transform('first')



2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ Performance Improvements
- Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
- Improved performance of timeseries plotting with an irregular DatetimeIndex
(or with ``compat_x=True``) (:issue:`15073`).

- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)

- When reading a buffer object in ``read_sas()`` without a specified format, the filepath string is now inferred rather than the buffer object.

Expand Down
25 changes: 23 additions & 2 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
'last', 'first',
'head', 'tail', 'median',
'mean', 'sum', 'min', 'max',
'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
'cumcount',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If cummin and cummax can be removed, should cumsum and cumprod also not be removed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean "also be removed" instead of "also not be removed"? If so, you're probably correct; I'll try removing them to see if anything breaks.

'resample',
'describe',
'rank', 'quantile',
Expand All @@ -97,7 +97,8 @@
_dataframe_apply_whitelist = \
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])

_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift'])
_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
'cummin', 'cummax'])


def _groupby_function(name, alias, npfunc, numeric_only=True,
Expand Down Expand Up @@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs):

return self._cython_transform('cumsum')

@Substitution(name='groupby')
@Appender(_doc_template)
def cummin(self, axis=0):
    """Cumulative min for each group"""
    if axis == 0:
        # Fast path: grouped cumulative min runs in Cython.
        return self._cython_transform('cummin')
    # Any other axis falls back to a per-group NumPy accumulate.
    return self.apply(lambda values: np.minimum.accumulate(values, axis))

@Substitution(name='groupby')
@Appender(_doc_template)
def cummax(self, axis=0):
    """Cumulative max for each group"""
    if axis == 0:
        # Fast path: grouped cumulative max runs in Cython.
        return self._cython_transform('cummax')
    # Any other axis falls back to a per-group NumPy accumulate.
    return self.apply(lambda values: np.maximum.accumulate(values, axis))

@Substitution(name='groupby')
@Appender(_doc_template)
def shift(self, periods=1, freq=None, axis=0):
Expand Down Expand Up @@ -1752,6 +1771,8 @@ def get_group_levels(self):
'transform': {
'cumprod': 'group_cumprod',
'cumsum': 'group_cumsum',
'cummin': 'group_cummin',
'cummax': 'group_cummax',
}
}

Expand Down
70 changes: 70 additions & 0 deletions pandas/src/algos_groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,76 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
else:
out[i, j] = minx[i, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                          ndarray[{{dest_type2}}, ndim=2] values,
                          ndarray[int64_t] labels,
                          ndarray[{{dest_type2}}, ndim=2] accum):
    """
    Cumulative minimum of `values` within each group, written into `out`.

    Only transforms on axis=0.  Rows with a negative label are skipped
    (their `out` entries are left untouched).  A NaN value (val != val)
    writes {{nan_val}} to `out` and does not update the running minimum.

    NOTE(review): `accum` is caller-allocated scratch holding the running
    minimum per (group, column).  It is only used internally; allocating
    it here instead would be straightforward in this templated code and
    is a candidate for a follow-up cleanup (harder with fused types /
    memoryviews, which is why the cumsum/cumprod code passes it in).
    """
    cdef:
        Py_ssize_t i, j, N, K
        {{dest_type2}} val
        int64_t lab

    N, K = (<object> values).shape
    # Seed every running minimum with the dtype's "infinity" sentinel so
    # the first real value always replaces it.
    accum.fill({{inf_val}})

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                if val == val:
                    if val < accum[lab, j]:
                        accum[lab, j] = val
                    out[i, j] = accum[lab, j]
                else:
                    # val is NaN: emit the dtype's NaN marker.
                    out[i, j] = {{nan_val}}


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                          ndarray[{{dest_type2}}, ndim=2] values,
                          ndarray[int64_t] labels,
                          ndarray[{{dest_type2}}, ndim=2] accum):
    """
    Cumulative maximum of `values` within each group, written into `out`.

    Only transforms on axis=0.  Rows with a negative label are skipped
    (their `out` entries are left untouched).  A NaN value (val != val)
    writes {{nan_val}} to `out` and does not update the running maximum.

    NOTE(review): `accum` is caller-allocated scratch holding the running
    maximum per (group, column); see group_cummin for the follow-up note
    about allocating it internally.
    """
    cdef:
        Py_ssize_t i, j, N, K
        {{dest_type2}} val
        int64_t lab

    N, K = (<object> values).shape
    # Seed every running maximum with the dtype's "-infinity" sentinel so
    # the first real value always replaces it.
    accum.fill(-{{inf_val}})

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                if val == val:
                    if val > accum[lab, j]:
                        accum[lab, j] = val
                    out[i, j] = accum[lab, j]
                else:
                    # val is NaN: emit the dtype's NaN marker.
                    out[i, j] = {{nan_val}}

{{endfor}}

#----------------------------------------------------------------------
Expand Down
87 changes: 79 additions & 8 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self):
'max',
'head',
'tail',
'cumsum',
'cumprod',
'cummin',
'cummax',
'cumcount',
'resample',
'describe',
Expand Down Expand Up @@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self):
'max',
'head',
'tail',
'cumsum',
'cumprod',
'cummin',
'cummax',
'cumcount',
'resample',
'describe',
Expand Down Expand Up @@ -5777,6 +5769,85 @@ def test_agg_over_numpy_arrays(self):

assert_frame_equal(result, expected)

def test_cummin_cummax(self):
# GH 15048
num_types = [np.int32, np.int64, np.float32, np.float64]
num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min,
np.finfo(np.float32).min, np.finfo(np.float64).min]
num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max,
np.finfo(np.float32).max, np.finfo(np.float64).max]
base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
'B': [3, 4, 3, 2, 2, 3, 2, 1]})
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
df = base_df.astype(dtype)

# cummin
expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
result = df.groupby('A').cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummin w/ min value for dtype
df.loc[[2, 6], 'B'] = min_val
expected.loc[[2, 3, 6, 7], 'B'] = min_val
result = df.groupby('A').cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# cummax
expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
result = df.groupby('A').cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummax w/ max value for dtype
df.loc[[2, 6], 'B'] = max_val
expected.loc[[2, 3, 6, 7], 'B'] = max_val
result = df.groupby('A').cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2,
np.nan, 3, np.nan, 1]})
result = base_df.groupby('A').cummin()
tm.assert_frame_equal(result, expected)
expected = (base_df.groupby('A')
.B
.apply(lambda x: x.cummin())
.to_frame())
tm.assert_frame_equal(result, expected)

expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4,
np.nan, 3, np.nan, 3]})
result = base_df.groupby('A').cummax()
tm.assert_frame_equal(result, expected)
expected = (base_df.groupby('A')
.B
.apply(lambda x: x.cummax())
.to_frame())
tm.assert_frame_equal(result, expected)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also test where first element is nan.

# Test nan in entire column
base_df['B'] = np.nan
expected = pd.DataFrame({'B': [np.nan] * 8})
result = base_df.groupby('A').cummin()
tm.assert_frame_equal(expected, result)
result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(expected, result)
result = base_df.groupby('A').cummax()
tm.assert_frame_equal(expected, result)
result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(expected, result)


def assert_fp_equal(a, b):
assert (np.abs(a - b) < 1e-12).all()
Expand Down