diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
index 405b8cc0a5ded..1dc24c89d825e 100644
--- a/doc/source/whatsnew/v1.6.0.rst
+++ b/doc/source/whatsnew/v1.6.0.rst
@@ -44,8 +44,33 @@ These are bug fixes that might have notable behavior changes.
 
 .. _whatsnew_160.notable_bug_fixes.notable_bug_fix1:
 
-notable_bug_fix1
-^^^^^^^^^^^^^^^^
+:meth:`.GroupBy.cumsum` and :meth:`.GroupBy.cumprod` overflow instead of lossy casting to float
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions we cast to float when applying ``cumsum`` and ``cumprod`` which
+led to incorrect results even if the result could be held by ``int64`` dtype.
+Additionally, the aggregation overflows consistently with numpy and the regular
+:meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods when the limit of
+``int64`` is reached (:issue:`37493`).
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
+    In [2]: df.groupby("key")["value"].cumprod()[5]
+    Out[2]: 5.960464477539062e+16
+
+We return incorrect results with the 6th value.
+
+*New Behavior*
+
+.. ipython:: python
+
+    df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
+    df.groupby("key")["value"].cumprod()
+
+We overflow with the 7th value, but the 6th value is still correct.
 
 
 ..
_whatsnew_160.notable_bug_fixes.notable_bug_fix2:
@@ -103,7 +128,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
-- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` for nullable dtypes (:issue:`37493`)
+- Performance improvement in :meth:`.DataFrameGroupBy.median`, :meth:`.SeriesGroupBy.median`, and :meth:`.GroupBy.cumprod` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
 - Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
 - Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`)
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 91a7486bdc590..0f72b2b72141f 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -13,13 +13,15 @@ def group_median_float64(
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
 ) -> None: ...
-def group_cumprod_float64(
+def group_cumprod(
     out: np.ndarray,  # float64_t[:, ::1]
     values: np.ndarray,  # const float64_t[:, :]
     labels: np.ndarray,  # const int64_t[:]
     ngroups: int,
     is_datetimelike: bool,
     skipna: bool = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
def group_cumsum(
     out: np.ndarray,  # int64float_t[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index cd1b84692cf5b..f798655e9d922 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -228,13 +228,15 @@ def group_median_float64(
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cumprod_float64(
-    float64_t[:, ::1] out,
-    const float64_t[:, :] values,
+def group_cumprod(
+    int64float_t[:, ::1] out,
+    ndarray[int64float_t, ndim=2] values,
     const intp_t[::1] labels,
     int ngroups,
     bint is_datetimelike,
     bint skipna=True,
+    const uint8_t[:, :] mask=None,
+    uint8_t[:, ::1] result_mask=None,
 ) -> None:
     """
     Cumulative product of columns of `values`, in row groups `labels`.
@@ -253,6 +255,10 @@ def group_cumprod(
         Always false, `values` is never datetime-like.
     skipna : bool
         If true, ignore nans in `values`.
+    mask : np.ndarray[uint8], optional
+        Mask of values
+    result_mask : np.ndarray[uint8], optional
+        Mask of out array
 
     Notes
     -----
@@ -260,12 +266,17 @@ def group_cumprod(
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        float64_t val
-        float64_t[:, ::1] accum
+        int64float_t val, na_val
+        int64float_t[:, ::1] accum
         intp_t lab
+        uint8_t[:, ::1] accum_mask
+        bint isna_entry, isna_prev = False
+        bint uses_mask = mask is not None
 
     N, K = (<object>values).shape
-    accum = np.ones((ngroups, K), dtype=np.float64)
+    accum = np.ones((ngroups, K), dtype=(<object>values).dtype)
+    na_val = _get_na_val(<int64float_t>0, is_datetimelike)
+    accum_mask = np.zeros((ngroups, K), dtype="uint8")
 
     with nogil:
         for i in range(N):
@@ -275,13 +286,35 @@ def group_cumprod(
                 continue
             for j in range(K):
                 val = values[i, j]
-                if val == val:
-                    accum[lab, j] *= val
-                    out[i, j] = accum[lab, j]
+
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                elif int64float_t is float64_t or int64float_t is float32_t:
+                    isna_entry = val != val
                 else:
-                    out[i, j] = NaN
+                    isna_entry = False
+
+                if not isna_entry:
+                    isna_prev = accum_mask[lab, j]
+                    if isna_prev:
+                        out[i, j] = na_val
+                        if uses_mask:
+                            result_mask[i, j] = True
+
+                    else:
+                        accum[lab, j] *= val
+                        out[i, j] = accum[lab, j]
+
+                else:
+                    if uses_mask:
+                        result_mask[i, j] = True
+                        out[i, j] = 0
+                    else:
+                        out[i, j] = na_val
+
                     if not skipna:
-                        accum[lab, j] = NaN
+                        accum[lab, j] = na_val
+                        accum_mask[lab, j] = True
 
 
 @cython.boundscheck(False)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index ba808e1f2e07f..597932a55f897 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
             "ohlc": "group_ohlc",
         },
         "transform": {
-            "cumprod": "group_cumprod_float64",
+            "cumprod": "group_cumprod",
             "cumsum": "group_cumsum",
             "cummin": "group_cummin",
             "cummax": "group_cummax",
@@ -158,6 +158,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
             "rank",
             "sum",
             "ohlc",
+            "cumprod",
             "cumsum",
             "prod",
             "mean",
@@ -218,7 +219,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
         """
         how = self.how
 
-        if how in ["median", "cumprod"]:
-            # these two only have float64 implementations
+        if how in ["median"]:
+            # median only has a float64 implementation
             # We should only get here with is_numeric, as non-numeric cases
             # should raise in _get_cython_function
@@ -231,7 +232,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
             # result may still include NaN, so we have to cast
             values = ensure_float64(values)
 
-        elif how in ["sum", "ohlc", "prod", "cumsum"]:
+        elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]:
             # Avoid overflow during group op
             if values.dtype.kind == "i":
                 values = ensure_int64(values)
@@ -330,7 +331,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
         """
         how = self.how
 
-        if how in ["sum", "cumsum", "sum", "prod"]:
+        if how in ["sum", "cumsum", "prod", "cumprod"]:
             if dtype == np.dtype(bool):
                 return np.dtype(np.int64)
         elif how in ["mean", "median", "var"]:
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index
d813a2848a5dc..cdbb121819c5e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -641,15 +641,30 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) df = DataFrame({"key": ["b"] * 100, "value": 2}) - actual = df.groupby("key")["value"].cumprod() - # if overflows, groupby product casts to float - # while numpy passes back invalid values df["value"] = df["value"].astype(float) + actual = df.groupby("key")["value"].cumprod() expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) expected.name = "value" tm.assert_series_equal(actual, expected) +def test_groupby_cumprod_overflow(): + # GH#37493 if we overflow we return garbage consistent with numpy + df = DataFrame({"key": ["b"] * 4, "value": 100_000}) + actual = df.groupby("key")["value"].cumprod() + expected = Series( + [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], + name="value", + ) + tm.assert_series_equal(actual, expected) + + numpy_result = df.groupby("key", group_keys=False)["value"].apply( + lambda x: x.cumprod() + ) + numpy_result.name = "value" + tm.assert_series_equal(actual, numpy_result) + + def test_groupby_cumprod_nan_influences_other_columns(): # GH#48064 df = DataFrame( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ba39f76203623..5a65e79849cdb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2847,14 +2847,14 @@ def test_single_element_list_grouping(): values, _ = next(iter(df.groupby(["a"]))) -@pytest.mark.parametrize("func", ["sum", "cumsum", "prod"]) +@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"]) def test_groupby_avoid_casting_to_float(func): # GH#37493 val = 922337203685477580 df = DataFrame({"a": 1, "b": [val]}) result = getattr(df.groupby("a"), func)() - val expected = DataFrame({"b": [0]}, index=Index([1], name="a")) - if func == "cumsum": + if func in 
["cumsum", "cumprod"]: expected = expected.reset_index(drop=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 24abbd0f65795..4cea6d10af9e0 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -3,7 +3,7 @@ from pandas._libs import groupby as libgroupby from pandas._libs.groupby import ( - group_cumprod_float64, + group_cumprod, group_cumsum, group_mean, group_var, @@ -194,7 +194,7 @@ def test_cython_group_transform_cumsum(np_dtype): def test_cython_group_transform_cumprod(): # see gh-4095 dtype = np.float64 - pd_op, np_op = group_cumprod_float64, np.cumproduct + pd_op, np_op = group_cumprod, np.cumproduct _check_cython_group_transform_cumulative(pd_op, np_op, dtype) @@ -209,7 +209,7 @@ def test_cython_group_transform_algos(): data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") actual = np.zeros_like(data) actual.fill(np.nan) - group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + group_cumprod(actual, data, labels, ngroups, is_datetimelike) expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected)