Skip to content

ENH: Support mask in groupby cumprod #48138

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 19, 2022
31 changes: 28 additions & 3 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,33 @@ These are bug fixes that might have notable behavior changes.

.. _whatsnew_160.notable_bug_fixes.notable_bug_fix1:

notable_bug_fix1
^^^^^^^^^^^^^^^^
:meth:`.GroupBy.cumsum` and :meth:`.GroupBy.cumprod` overflow instead of lossy casting to float
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions we cast to float when applying ``cumsum`` and ``cumprod`` which
lead to incorrect results even if the result could be hold by ``int64`` dtype.
Additionally, the aggregation overflows consistent with numpy and the regular
:meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods when the limit of
``int64`` is reached (:issue:`37493`).

*Old Behavior*

.. code-block:: ipython

In [1]: df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
In [2]: df.groupby("key")["value"].cumprod()[5]
Out[2]: 5.960464477539062e+16

We return incorrect results with the 6th value.

*New Behavior*

.. ipython:: python

df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
df.groupby("key")["value"].cumprod()

We overflow with the 7th value, but the 6th value is still correct.

.. _whatsnew_160.notable_bug_fixes.notable_bug_fix2:

Expand Down Expand Up @@ -103,7 +128,7 @@ Deprecations

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` for nullable dtypes (:issue:`37493`)
- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` and :meth:`.GroupBy.cumprod` for nullable dtypes (:issue:`37493`)
- Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
- Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`)
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ def group_median_float64(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cumprod_float64(
def group_cumprod(
out: np.ndarray, # float64_t[:, ::1]
values: np.ndarray, # const float64_t[:, :]
labels: np.ndarray, # const int64_t[:]
ngroups: int,
is_datetimelike: bool,
skipna: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cumsum(
out: np.ndarray, # int64float_t[:, ::1]
Expand Down
55 changes: 44 additions & 11 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -228,13 +228,15 @@ def group_median_float64(

@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumprod_float64(
float64_t[:, ::1] out,
const float64_t[:, :] values,
def group_cumprod(
int64float_t[:, ::1] out,
ndarray[int64float_t, ndim=2] values,
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike,
bint skipna=True,
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
"""
Cumulative product of columns of `values`, in row groups `labels`.
Expand All @@ -253,19 +255,28 @@ def group_cumprod_float64(
Always false, `values` is never datetime-like.
skipna : bool
If true, ignore nans in `values`.
mask: np.ndarray[uint8], optional
Mask of values
result_mask: np.ndarray[int8], optional
Mask of out array

Notes
-----
This method modifies the `out` parameter, rather than returning an object.
"""
cdef:
Py_ssize_t i, j, N, K, size
float64_t val
float64_t[:, ::1] accum
int64float_t val, na_val
int64float_t[:, ::1] accum
intp_t lab
uint8_t[:, ::1] accum_mask
bint isna_entry, isna_prev = False
bint uses_mask = mask is not None

N, K = (<object>values).shape
accum = np.ones((ngroups, K), dtype=np.float64)
accum = np.ones((ngroups, K), dtype=(<object>values).dtype)
na_val = _get_na_val(<int64float_t>0, is_datetimelike)
accum_mask = np.zeros((ngroups, K), dtype="uint8")

with nogil:
for i in range(N):
Expand All @@ -275,13 +286,35 @@ def group_cumprod_float64(
continue
for j in range(K):
val = values[i, j]
if val == val:
accum[lab, j] *= val
out[i, j] = accum[lab, j]

if uses_mask:
isna_entry = mask[i, j]
elif int64float_t is float64_t or int64float_t is float32_t:
isna_entry = val != val
else:
out[i, j] = NaN
isna_entry = False

if not isna_entry:
isna_prev = accum_mask[lab, j]
if isna_prev:
out[i, j] = na_val
if uses_mask:
result_mask[i, j] = True

else:
accum[lab, j] *= val
out[i, j] = accum[lab, j]

else:
if uses_mask:
result_mask[i, j] = True
out[i, j] = 0
else:
out[i, j] = na_val

if not skipna:
accum[lab, j] = NaN
accum[lab, j] = na_val
accum_mask[lab, j] = True


@cython.boundscheck(False)
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
"ohlc": "group_ohlc",
},
"transform": {
"cumprod": "group_cumprod_float64",
"cumprod": "group_cumprod",
"cumsum": "group_cumsum",
"cummin": "group_cummin",
"cummax": "group_cummax",
Expand All @@ -158,6 +158,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
"rank",
"sum",
"ohlc",
"cumprod",
"cumsum",
"prod",
"mean",
Expand Down Expand Up @@ -218,7 +219,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
"""
how = self.how

if how in ["median", "cumprod"]:
if how in ["median"]:
# these two only have float64 implementations
# We should only get here with is_numeric, as non-numeric cases
# should raise in _get_cython_function
Expand All @@ -231,7 +232,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
# result may still include NaN, so we have to cast
values = ensure_float64(values)

elif how in ["sum", "ohlc", "prod", "cumsum"]:
elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]:
# Avoid overflow during group op
if values.dtype.kind == "i":
values = ensure_int64(values)
Expand Down Expand Up @@ -330,7 +331,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
"""
how = self.how

if how in ["sum", "cumsum", "sum", "prod"]:
if how in ["sum", "cumsum", "sum", "prod", "cumprod"]:
if dtype == np.dtype(bool):
return np.dtype(np.int64)
elif how in ["mean", "median", "var"]:
Expand Down
21 changes: 18 additions & 3 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,15 +641,30 @@ def test_groupby_cumprod():
tm.assert_series_equal(actual, expected)

df = DataFrame({"key": ["b"] * 100, "value": 2})
actual = df.groupby("key")["value"].cumprod()
# if overflows, groupby product casts to float
# while numpy passes back invalid values
df["value"] = df["value"].astype(float)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can maybe keep this with as int (or test both in addition), so we have a test for the silent overflow behaviour?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a new test explicitly testing that overflow is consistent with numpy

actual = df.groupby("key")["value"].cumprod()
expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
expected.name = "value"
tm.assert_series_equal(actual, expected)


def test_groupby_cumprod_overflow():
# GH#37493 if we overflow we return garbage consistent with numpy
df = DataFrame({"key": ["b"] * 4, "value": 100_000})
actual = df.groupby("key")["value"].cumprod()
expected = Series(
[100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
name="value",
)
tm.assert_series_equal(actual, expected)

numpy_result = df.groupby("key", group_keys=False)["value"].apply(
lambda x: x.cumprod()
)
numpy_result.name = "value"
tm.assert_series_equal(actual, numpy_result)


def test_groupby_cumprod_nan_influences_other_columns():
# GH#48064
df = DataFrame(
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2847,14 +2847,14 @@ def test_single_element_list_grouping():
values, _ = next(iter(df.groupby(["a"])))


@pytest.mark.parametrize("func", ["sum", "cumsum", "prod"])
@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
def test_groupby_avoid_casting_to_float(func):
# GH#37493
val = 922337203685477580
df = DataFrame({"a": 1, "b": [val]})
result = getattr(df.groupby("a"), func)() - val
expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
if func == "cumsum":
if func in ["cumsum", "cumprod"]:
expected = expected.reset_index(drop=True)
tm.assert_frame_equal(result, expected)

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/groupby/test_libgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
group_cumprod_float64,
group_cumprod,
group_cumsum,
group_mean,
group_var,
Expand Down Expand Up @@ -194,7 +194,7 @@ def test_cython_group_transform_cumsum(np_dtype):
def test_cython_group_transform_cumprod():
# see gh-4095
dtype = np.float64
pd_op, np_op = group_cumprod_float64, np.cumproduct
pd_op, np_op = group_cumprod, np.cumproduct
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)


Expand All @@ -209,7 +209,7 @@ def test_cython_group_transform_algos():
data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
actual = np.zeros_like(data)
actual.fill(np.nan)
group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike)
group_cumprod(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)

Expand Down