-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: Cythonize Groupby.cummin/cummax (#15048) #15053
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -568,6 +568,76 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, | |
else: | ||
out[i, j] = minx[i, j] | ||
|
||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, | ||
ndarray[{{dest_type2}}, ndim=2] values, | ||
ndarray[int64_t] labels, | ||
ndarray[{{dest_type2}}, ndim=2] accum): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so I am puzzled why we even pass in @mroeschke if you want to do a followup to remove it after would be great. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback - I had done this pattern in the By example
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ahh I see. ok, in the templated code this should be straightforward then. after seeing how easy templated code is, I much prefer to fused types :> |
||
""" | ||
Only transforms on axis=0 | ||
""" | ||
cdef: | ||
Py_ssize_t i, j, N, K, size | ||
{{dest_type2}} val, min_val | ||
int64_t lab | ||
|
||
N, K = (<object> values).shape | ||
accum.fill({{inf_val}}) | ||
|
||
with nogil: | ||
for i in range(N): | ||
lab = labels[i] | ||
|
||
if lab < 0: | ||
continue | ||
for j in range(K): | ||
val = values[i, j] | ||
if val == val: | ||
if val < accum[lab, j]: | ||
min_val = val | ||
accum[lab, j] = min_val | ||
out[i, j] = accum[lab, j] | ||
# val = nan | ||
else: | ||
out[i, j] = {{nan_val}} | ||
|
||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, | ||
ndarray[{{dest_type2}}, ndim=2] values, | ||
ndarray[int64_t] labels, | ||
ndarray[{{dest_type2}}, ndim=2] accum): | ||
""" | ||
Only transforms on axis=0 | ||
""" | ||
cdef: | ||
Py_ssize_t i, j, N, K, size | ||
{{dest_type2}} val, max_val | ||
int64_t lab | ||
|
||
N, K = (<object> values).shape | ||
accum.fill(-{{inf_val}}) | ||
|
||
with nogil: | ||
for i in range(N): | ||
lab = labels[i] | ||
|
||
if lab < 0: | ||
continue | ||
for j in range(K): | ||
val = values[i, j] | ||
if val == val: | ||
if val > accum[lab, j]: | ||
max_val = val | ||
accum[lab, j] = max_val | ||
out[i, j] = accum[lab, j] | ||
# val = nan | ||
else: | ||
out[i, j] = {{nan_val}} | ||
|
||
{{endfor}} | ||
|
||
#---------------------------------------------------------------------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self): | |
'max', | ||
'head', | ||
'tail', | ||
'cumsum', | ||
'cumprod', | ||
'cummin', | ||
'cummax', | ||
'cumcount', | ||
'resample', | ||
'describe', | ||
|
@@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self): | |
'max', | ||
'head', | ||
'tail', | ||
'cumsum', | ||
'cumprod', | ||
'cummin', | ||
'cummax', | ||
'cumcount', | ||
'resample', | ||
'describe', | ||
|
@@ -5777,6 +5769,85 @@ def test_agg_over_numpy_arrays(self): | |
|
||
assert_frame_equal(result, expected) | ||
|
||
def test_cummin_cummax(self): | ||
# GH 15048 | ||
num_types = [np.int32, np.int64, np.float32, np.float64] | ||
num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, | ||
np.finfo(np.float32).min, np.finfo(np.float64).min] | ||
num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, | ||
np.finfo(np.float32).max, np.finfo(np.float64).max] | ||
base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], | ||
'B': [3, 4, 3, 2, 2, 3, 2, 1]}) | ||
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] | ||
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] | ||
|
||
for dtype, min_val, max_val in zip(num_types, num_mins, num_max): | ||
df = base_df.astype(dtype) | ||
|
||
# cummin | ||
expected = pd.DataFrame({'B': expected_mins}).astype(dtype) | ||
result = df.groupby('A').cummin() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test cummin w/ min value for dtype | ||
df.loc[[2, 6], 'B'] = min_val | ||
expected.loc[[2, 3, 6, 7], 'B'] = min_val | ||
result = df.groupby('A').cummin() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# cummax | ||
expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) | ||
result = df.groupby('A').cummax() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test cummax w/ max value for dtype | ||
df.loc[[2, 6], 'B'] = max_val | ||
expected.loc[[2, 3, 6, 7], 'B'] = max_val | ||
result = df.groupby('A').cummax() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test nan in some values | ||
base_df.loc[[0, 2, 4, 6], 'B'] = np.nan | ||
expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, | ||
np.nan, 3, np.nan, 1]}) | ||
result = base_df.groupby('A').cummin() | ||
tm.assert_frame_equal(result, expected) | ||
expected = (base_df.groupby('A') | ||
.B | ||
.apply(lambda x: x.cummin()) | ||
.to_frame()) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, | ||
np.nan, 3, np.nan, 3]}) | ||
result = base_df.groupby('A').cummax() | ||
tm.assert_frame_equal(result, expected) | ||
expected = (base_df.groupby('A') | ||
.B | ||
.apply(lambda x: x.cummax()) | ||
.to_frame()) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also test where first element is nan. |
||
# Test nan in entire column | ||
base_df['B'] = np.nan | ||
expected = pd.DataFrame({'B': [np.nan] * 8}) | ||
result = base_df.groupby('A').cummin() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby('A').cummax() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
def assert_fp_equal(a, b): | ||
assert (np.abs(a - b) < 1e-12).all() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If cummin and cummax can be removed, should cumsum and cumprod also not be removed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean "also be removed" instead of "also not be removed"? If so, you're probably correct, I'll try remove them to see if anything breaks.