-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Support mask in GroupBy.cumsum #48070
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
0ca2233
111a55b
0fa1f3a
d9b10f1
0f09ea8
a7d918c
df4d4f6
3ee7854
ff83f13
9deceab
bcb5550
5609150
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -1070,8 +1070,9 @@ Groupby/resample/rolling | |||||
- Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) | ||||||
- Bug in :meth:`.DataFrameGroupBy.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) | ||||||
- Bug in :meth:`DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) | ||||||
- Bug in :meth:`GroupBy.sum` with integer dtypes losing precision (:issue:`37493`) | ||||||
- Bug in :meth:`GroupBy.sum` and :meth:`GroupBy.cumsum` with integer dtypes losing precision (:issue:`37493`) | ||||||
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) | ||||||
- Bug in :meth:`GroupBy.cumsum` with integer dtypes causing overflows when sum was bigger than maximum of dtype (:issue:`37493`) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) | ||||||
- Bug in :meth:`DataFrame.groupby` raising error when ``None`` is in first level of :class:`MultiIndex` (:issue:`47348`) | ||||||
- Bug in :meth:`.GroupBy.cummax` with ``int64`` dtype with leading value being the smallest possible int64 (:issue:`46382`) | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -207,15 +207,24 @@ def group_cumprod_float64( | |
break | ||
|
||
|
||
ctypedef fused cumsum_t: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the point of switching to this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keeping precision for Int dtypes with missing values. Currently this is cast to float, which loses the value for large integers also should help performance for ea arrays There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, this overflow pretty easily right now with int8 dtypes etc, this is also fixed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be misreading but this is just a subset of the types that were previously used in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are two things here:
I don't want to create specific dtypes per function. I want to keep the mask support for every function in separate prs. When more and more get merged, I will be able to combine the types and then we will be able to use these types for more functions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed the type to make it clear that I intend to reuse it |
||
int64_t | ||
uint64_t | ||
float32_t | ||
float64_t | ||
|
||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def group_cumsum( | ||
numeric_t[:, ::1] out, | ||
ndarray[numeric_t, ndim=2] values, | ||
cumsum_t[:, ::1] out, | ||
ndarray[cumsum_t, ndim=2] values, | ||
const intp_t[::1] labels, | ||
int ngroups, | ||
bint is_datetimelike, | ||
bint skipna=True, | ||
const uint8_t[:, :] mask=None, | ||
uint8_t[:, ::1] result_mask=None, | ||
) -> None: | ||
""" | ||
Cumulative sum of columns of `values`, in row groups `labels`. | ||
|
@@ -234,23 +243,33 @@ def group_cumsum( | |
True if `values` contains datetime-like entries. | ||
skipna : bool | ||
If true, ignore nans in `values`. | ||
mask: np.ndarray[uint8], optional | ||
Mask of values | ||
result_mask: np.ndarray[int8], optional | ||
Mask of out array | ||
|
||
Notes | ||
----- | ||
This method modifies the `out` parameter, rather than returning an object. | ||
""" | ||
cdef: | ||
Py_ssize_t i, j, N, K, size | ||
numeric_t val, y, t, na_val | ||
numeric_t[:, ::1] accum, compensation | ||
cumsum_t val, y, t, na_val | ||
cumsum_t[:, ::1] accum, compensation | ||
uint8_t[:, ::1] accum_mask | ||
intp_t lab | ||
bint isna_entry, isna_prev = False | ||
bint uses_mask = mask is not None | ||
|
||
N, K = (<object>values).shape | ||
|
||
if uses_mask: | ||
accum_mask = np.zeros((ngroups, K), dtype="uint8") | ||
|
||
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype) | ||
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype) | ||
|
||
na_val = _get_na_val(<numeric_t>0, is_datetimelike) | ||
na_val = _get_na_val(<cumsum_t>0, is_datetimelike) | ||
|
||
with nogil: | ||
for i in range(N): | ||
|
@@ -261,23 +280,41 @@ def group_cumsum( | |
for j in range(K): | ||
val = values[i, j] | ||
|
||
isna_entry = _treat_as_na(val, is_datetimelike) | ||
if uses_mask: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than continue There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried that, but imo that reduces readability. The uses_mask is a simple if_else branch, if we move this outside, it is hard to see that the actual logic of the algorithm is the same in both branches (and we have to keep it consistent over time). |
||
isna_entry = mask[i, j] | ||
else: | ||
isna_entry = _treat_as_na(val, is_datetimelike) | ||
|
||
if not skipna: | ||
isna_prev = _treat_as_na(accum[lab, j], is_datetimelike) | ||
if uses_mask: | ||
isna_prev = accum_mask[lab, j] | ||
else: | ||
isna_prev = _treat_as_na(accum[lab, j], is_datetimelike) | ||
|
||
if isna_prev: | ||
out[i, j] = na_val | ||
if uses_mask: | ||
result_mask[i, j] = True | ||
else: | ||
out[i, j] = na_val | ||
continue | ||
|
||
if isna_entry: | ||
out[i, j] = na_val | ||
|
||
if uses_mask: | ||
result_mask[i, j] = True | ||
else: | ||
out[i, j] = na_val | ||
|
||
if not skipna: | ||
accum[lab, j] = na_val | ||
if uses_mask: | ||
accum_mask[lab, j] = True | ||
else: | ||
accum[lab, j] = na_val | ||
|
||
else: | ||
# For floats, use Kahan summation to reduce floating-point | ||
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) | ||
if numeric_t == float32_t or numeric_t == float64_t: | ||
if cumsum_t == float32_t or cumsum_t == float64_t: | ||
y = val - compensation[lab, j] | ||
t = accum[lab, j] + y | ||
compensation[lab, j] = t - accum[lab, j] - y | ||
|
@@ -1041,7 +1078,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike): | |
elif numeric_t is int64_t and is_datetimelike: | ||
na_val = NPY_NAT | ||
else: | ||
# Will not be used, but define to avoid uninitialized warning. | ||
# Used in case of masks | ||
na_val = 0 | ||
return na_val | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(the leading dot is kind of a wildcard * so that sphinx looks in the full nested pandas namespace for a match (and not only in the top-level namespace), to avoid writing it out as pandas.core.groupby.GroupBy.cumsum)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thx, I've also changed the references from the other pro that were already merged