Skip to content

Commit f19aeaf

Browse files
authored
ENH: Support mask in groupby cumprod (#48138)
* ENH: Support mask in groupby cumprod * Add whatsnew * Move whatsnew * Adress review * Fix example * Clarify * Change dtype access
1 parent 1273bc9 commit f19aeaf

File tree

7 files changed

+103
-27
lines changed

7 files changed

+103
-27
lines changed

doc/source/whatsnew/v1.6.0.rst

+28-3
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,33 @@ These are bug fixes that might have notable behavior changes.
4545

4646
.. _whatsnew_160.notable_bug_fixes.notable_bug_fix1:
4747

48-
notable_bug_fix1
49-
^^^^^^^^^^^^^^^^
48+
:meth:`.GroupBy.cumsum` and :meth:`.GroupBy.cumprod` overflow instead of lossy casting to float
49+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50+
51+
In previous versions we cast to float when applying ``cumsum`` and ``cumprod`` which
52+
lead to incorrect results even if the result could be hold by ``int64`` dtype.
53+
Additionally, the aggregation overflows consistent with numpy and the regular
54+
:meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods when the limit of
55+
``int64`` is reached (:issue:`37493`).
56+
57+
*Old Behavior*
58+
59+
.. code-block:: ipython
60+
61+
In [1]: df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
62+
In [2]: df.groupby("key")["value"].cumprod()[5]
63+
Out[2]: 5.960464477539062e+16
64+
65+
We return incorrect results with the 6th value.
66+
67+
*New Behavior*
68+
69+
.. ipython:: python
70+
71+
df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
72+
df.groupby("key")["value"].cumprod()
73+
74+
We overflow with the 7th value, but the 6th value is still correct.
5075

5176
.. _whatsnew_160.notable_bug_fixes.notable_bug_fix2:
5277

@@ -104,7 +129,7 @@ Deprecations
104129

105130
Performance improvements
106131
~~~~~~~~~~~~~~~~~~~~~~~~
107-
- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` for nullable dtypes (:issue:`37493`)
132+
- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` and :meth:`.GroupBy.cumprod` for nullable dtypes (:issue:`37493`)
108133
- Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
109134
- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
110135
- Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`)

pandas/_libs/groupby.pyi

+3-1
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@ def group_median_float64(
1313
mask: np.ndarray | None = ...,
1414
result_mask: np.ndarray | None = ...,
1515
) -> None: ...
16-
def group_cumprod_float64(
16+
def group_cumprod(
1717
out: np.ndarray, # float64_t[:, ::1]
1818
values: np.ndarray, # const float64_t[:, :]
1919
labels: np.ndarray, # const int64_t[:]
2020
ngroups: int,
2121
is_datetimelike: bool,
2222
skipna: bool = ...,
23+
mask: np.ndarray | None = ...,
24+
result_mask: np.ndarray | None = ...,
2325
) -> None: ...
2426
def group_cumsum(
2527
out: np.ndarray, # int64float_t[:, ::1]

pandas/_libs/groupby.pyx

+44-11
Original file line numberDiff line numberDiff line change
@@ -228,13 +228,15 @@ def group_median_float64(
228228

229229
@cython.boundscheck(False)
230230
@cython.wraparound(False)
231-
def group_cumprod_float64(
232-
float64_t[:, ::1] out,
233-
const float64_t[:, :] values,
231+
def group_cumprod(
232+
int64float_t[:, ::1] out,
233+
ndarray[int64float_t, ndim=2] values,
234234
const intp_t[::1] labels,
235235
int ngroups,
236236
bint is_datetimelike,
237237
bint skipna=True,
238+
const uint8_t[:, :] mask=None,
239+
uint8_t[:, ::1] result_mask=None,
238240
) -> None:
239241
"""
240242
Cumulative product of columns of `values`, in row groups `labels`.
@@ -253,19 +255,28 @@ def group_cumprod_float64(
253255
Always false, `values` is never datetime-like.
254256
skipna : bool
255257
If true, ignore nans in `values`.
258+
mask: np.ndarray[uint8], optional
259+
Mask of values
260+
result_mask: np.ndarray[int8], optional
261+
Mask of out array
256262

257263
Notes
258264
-----
259265
This method modifies the `out` parameter, rather than returning an object.
260266
"""
261267
cdef:
262268
Py_ssize_t i, j, N, K, size
263-
float64_t val
264-
float64_t[:, ::1] accum
269+
int64float_t val, na_val
270+
int64float_t[:, ::1] accum
265271
intp_t lab
272+
uint8_t[:, ::1] accum_mask
273+
bint isna_entry, isna_prev = False
274+
bint uses_mask = mask is not None
266275

267276
N, K = (<object>values).shape
268-
accum = np.ones((ngroups, K), dtype=np.float64)
277+
accum = np.ones((ngroups, K), dtype=(<object>values).dtype)
278+
na_val = _get_na_val(<int64float_t>0, is_datetimelike)
279+
accum_mask = np.zeros((ngroups, K), dtype="uint8")
269280

270281
with nogil:
271282
for i in range(N):
@@ -275,13 +286,35 @@ def group_cumprod_float64(
275286
continue
276287
for j in range(K):
277288
val = values[i, j]
278-
if val == val:
279-
accum[lab, j] *= val
280-
out[i, j] = accum[lab, j]
289+
290+
if uses_mask:
291+
isna_entry = mask[i, j]
292+
elif int64float_t is float64_t or int64float_t is float32_t:
293+
isna_entry = val != val
281294
else:
282-
out[i, j] = NaN
295+
isna_entry = False
296+
297+
if not isna_entry:
298+
isna_prev = accum_mask[lab, j]
299+
if isna_prev:
300+
out[i, j] = na_val
301+
if uses_mask:
302+
result_mask[i, j] = True
303+
304+
else:
305+
accum[lab, j] *= val
306+
out[i, j] = accum[lab, j]
307+
308+
else:
309+
if uses_mask:
310+
result_mask[i, j] = True
311+
out[i, j] = 0
312+
else:
313+
out[i, j] = na_val
314+
283315
if not skipna:
284-
accum[lab, j] = NaN
316+
accum[lab, j] = na_val
317+
accum_mask[lab, j] = True
285318

286319

287320
@cython.boundscheck(False)

pandas/core/groupby/ops.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
138138
"ohlc": "group_ohlc",
139139
},
140140
"transform": {
141-
"cumprod": "group_cumprod_float64",
141+
"cumprod": "group_cumprod",
142142
"cumsum": "group_cumsum",
143143
"cummin": "group_cummin",
144144
"cummax": "group_cummax",
@@ -158,6 +158,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
158158
"rank",
159159
"sum",
160160
"ohlc",
161+
"cumprod",
161162
"cumsum",
162163
"prod",
163164
"mean",
@@ -218,7 +219,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
218219
"""
219220
how = self.how
220221

221-
if how in ["median", "cumprod"]:
222+
if how in ["median"]:
222223
# these two only have float64 implementations
223224
# We should only get here with is_numeric, as non-numeric cases
224225
# should raise in _get_cython_function
@@ -231,7 +232,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
231232
# result may still include NaN, so we have to cast
232233
values = ensure_float64(values)
233234

234-
elif how in ["sum", "ohlc", "prod", "cumsum"]:
235+
elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]:
235236
# Avoid overflow during group op
236237
if values.dtype.kind == "i":
237238
values = ensure_int64(values)
@@ -330,7 +331,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
330331
"""
331332
how = self.how
332333

333-
if how in ["sum", "cumsum", "sum", "prod"]:
334+
if how in ["sum", "cumsum", "sum", "prod", "cumprod"]:
334335
if dtype == np.dtype(bool):
335336
return np.dtype(np.int64)
336337
elif how in ["mean", "median", "var"]:

pandas/tests/groupby/test_function.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -641,15 +641,30 @@ def test_groupby_cumprod():
641641
tm.assert_series_equal(actual, expected)
642642

643643
df = DataFrame({"key": ["b"] * 100, "value": 2})
644-
actual = df.groupby("key")["value"].cumprod()
645-
# if overflows, groupby product casts to float
646-
# while numpy passes back invalid values
647644
df["value"] = df["value"].astype(float)
645+
actual = df.groupby("key")["value"].cumprod()
648646
expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
649647
expected.name = "value"
650648
tm.assert_series_equal(actual, expected)
651649

652650

651+
def test_groupby_cumprod_overflow():
652+
# GH#37493 if we overflow we return garbage consistent with numpy
653+
df = DataFrame({"key": ["b"] * 4, "value": 100_000})
654+
actual = df.groupby("key")["value"].cumprod()
655+
expected = Series(
656+
[100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
657+
name="value",
658+
)
659+
tm.assert_series_equal(actual, expected)
660+
661+
numpy_result = df.groupby("key", group_keys=False)["value"].apply(
662+
lambda x: x.cumprod()
663+
)
664+
numpy_result.name = "value"
665+
tm.assert_series_equal(actual, numpy_result)
666+
667+
653668
def test_groupby_cumprod_nan_influences_other_columns():
654669
# GH#48064
655670
df = DataFrame(

pandas/tests/groupby/test_groupby.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2847,14 +2847,14 @@ def test_single_element_list_grouping():
28472847
values, _ = next(iter(df.groupby(["a"])))
28482848

28492849

2850-
@pytest.mark.parametrize("func", ["sum", "cumsum", "prod"])
2850+
@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
28512851
def test_groupby_avoid_casting_to_float(func):
28522852
# GH#37493
28532853
val = 922337203685477580
28542854
df = DataFrame({"a": 1, "b": [val]})
28552855
result = getattr(df.groupby("a"), func)() - val
28562856
expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
2857-
if func == "cumsum":
2857+
if func in ["cumsum", "cumprod"]:
28582858
expected = expected.reset_index(drop=True)
28592859
tm.assert_frame_equal(result, expected)
28602860

pandas/tests/groupby/test_libgroupby.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from pandas._libs import groupby as libgroupby
55
from pandas._libs.groupby import (
6-
group_cumprod_float64,
6+
group_cumprod,
77
group_cumsum,
88
group_mean,
99
group_var,
@@ -194,7 +194,7 @@ def test_cython_group_transform_cumsum(np_dtype):
194194
def test_cython_group_transform_cumprod():
195195
# see gh-4095
196196
dtype = np.float64
197-
pd_op, np_op = group_cumprod_float64, np.cumproduct
197+
pd_op, np_op = group_cumprod, np.cumproduct
198198
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)
199199

200200

@@ -209,7 +209,7 @@ def test_cython_group_transform_algos():
209209
data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
210210
actual = np.zeros_like(data)
211211
actual.fill(np.nan)
212-
group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike)
212+
group_cumprod(actual, data, labels, ngroups, is_datetimelike)
213213
expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
214214
tm.assert_numpy_array_equal(actual[:, 0], expected)
215215

0 commit comments

Comments
 (0)