ENH: Support mask in groupby cumprod (#48138)

phofl · web-flow · commit f19aeaf49762 · 2022-09-19T16:18:14.000-07:00
* ENH: Support mask in groupby cumprod

* Add whatsnew

* Move whatsnew

* Adress review

* Fix example

* Clarify

* Change dtype access
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
@@ -45,8 +45,33 @@ These are bug fixes that might have notable behavior changes.
 
 .. _whatsnew_160.notable_bug_fixes.notable_bug_fix1:
 
-notable_bug_fix1
-^^^^^^^^^^^^^^^^
+:meth:`.GroupBy.cumsum` and :meth:`.GroupBy.cumprod` overflow instead of lossy casting to float
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions we cast to float when applying ``cumsum`` and ``cumprod`` which
+lead to incorrect results even if the result could be hold by ``int64`` dtype.
+Additionally, the aggregation overflows consistent with numpy and the regular
+:meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods when the limit of
+``int64`` is reached (:issue:`37493`).
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
+    In [2]: df.groupby("key")["value"].cumprod()[5]
+    Out[2]: 5.960464477539062e+16
+
+We return incorrect results with the 6th value.
+
+*New Behavior*
+
+.. ipython:: python
+
+    df = pd.DataFrame({"key": ["b"] * 7, "value": 625})
+    df.groupby("key")["value"].cumprod()
+
+We overflow with the 7th value, but the 6th value is still correct.
 
 .. _whatsnew_160.notable_bug_fixes.notable_bug_fix2:
 
@@ -104,7 +129,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
-- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` for nullable dtypes (:issue:`37493`)
+- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` and :meth:`.GroupBy.cumprod` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
 - Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
 - Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`)
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -13,13 +13,15 @@ def group_median_float64(
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
 ) -> None: ...
-def group_cumprod_float64(
+def group_cumprod(
     out: np.ndarray,  # float64_t[:, ::1]
     values: np.ndarray,  # const float64_t[:, :]
     labels: np.ndarray,  # const int64_t[:]
     ngroups: int,
     is_datetimelike: bool,
     skipna: bool = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_cumsum(
     out: np.ndarray,  # int64float_t[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -228,13 +228,15 @@ def group_median_float64(
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cumprod_float64(
-    float64_t[:, ::1] out,
-    const float64_t[:, :] values,
+def group_cumprod(
+    int64float_t[:, ::1] out,
+    ndarray[int64float_t, ndim=2] values,
     const intp_t[::1] labels,
     int ngroups,
     bint is_datetimelike,
     bint skipna=True,
+    const uint8_t[:, :] mask=None,
+    uint8_t[:, ::1] result_mask=None,
 ) -> None:
     """
     Cumulative product of columns of `values`, in row groups `labels`.
@@ -253,19 +255,28 @@ def group_cumprod_float64(
         Always false, `values` is never datetime-like.
     skipna : bool
         If true, ignore nans in `values`.
+    mask: np.ndarray[uint8], optional
+        Mask of values
+    result_mask: np.ndarray[int8], optional
+        Mask of out array
 
     Notes
     -----
     This method modifies the `out` parameter, rather than returning an object.
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        float64_t val
-        float64_t[:, ::1] accum
+        int64float_t val, na_val
+        int64float_t[:, ::1] accum
         intp_t lab
+        uint8_t[:, ::1] accum_mask
+        bint isna_entry, isna_prev = False
+        bint uses_mask = mask is not None
 
     N, K = (<object>values).shape
-    accum = np.ones((ngroups, K), dtype=np.float64)
+    accum = np.ones((ngroups, K), dtype=(<object>values).dtype)
+    na_val = _get_na_val(<int64float_t>0, is_datetimelike)
+    accum_mask = np.zeros((ngroups, K), dtype="uint8")
 
     with nogil:
         for i in range(N):
@@ -275,13 +286,35 @@ def group_cumprod_float64(
                 continue
             for j in range(K):
                 val = values[i, j]
-                if val == val:
-                    accum[lab, j] *= val
-                    out[i, j] = accum[lab, j]
+
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                elif int64float_t is float64_t or int64float_t is float32_t:
+                    isna_entry = val != val
                 else:
-                    out[i, j] = NaN
+                    isna_entry = False
+
+                if not isna_entry:
+                    isna_prev = accum_mask[lab, j]
+                    if isna_prev:
+                        out[i, j] = na_val
+                        if uses_mask:
+                            result_mask[i, j] = True
+
+                    else:
+                        accum[lab, j] *= val
+                        out[i, j] = accum[lab, j]
+
+                else:
+                    if uses_mask:
+                        result_mask[i, j] = True
+                        out[i, j] = 0
+                    else:
+                        out[i, j] = na_val
+
                     if not skipna:
-                        accum[lab, j] = NaN
+                        accum[lab, j] = na_val
+                        accum_mask[lab, j] = True
 
 
 @cython.boundscheck(False)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
             "ohlc": "group_ohlc",
         },
         "transform": {
-            "cumprod": "group_cumprod_float64",
+            "cumprod": "group_cumprod",
             "cumsum": "group_cumsum",
             "cummin": "group_cummin",
             "cummax": "group_cummax",
@@ -158,6 +158,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         "rank",
         "sum",
         "ohlc",
+        "cumprod",
         "cumsum",
         "prod",
         "mean",
@@ -218,7 +219,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
         """
         how = self.how
 
-        if how in ["median", "cumprod"]:
+        if how in ["median"]:
             # these two only have float64 implementations
             # We should only get here with is_numeric, as non-numeric cases
             #  should raise in _get_cython_function
@@ -231,7 +232,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
-            elif how in ["sum", "ohlc", "prod", "cumsum"]:
+            elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]:
                 # Avoid overflow during group op
                 if values.dtype.kind == "i":
                     values = ensure_int64(values)
@@ -330,7 +331,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
         """
         how = self.how
 
-        if how in ["sum", "cumsum", "sum", "prod"]:
+        if how in ["sum", "cumsum", "sum", "prod", "cumprod"]:
             if dtype == np.dtype(bool):
                 return np.dtype(np.int64)
         elif how in ["mean", "median", "var"]:
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -641,15 +641,30 @@ def test_groupby_cumprod():
     tm.assert_series_equal(actual, expected)
 
     df = DataFrame({"key": ["b"] * 100, "value": 2})
-    actual = df.groupby("key")["value"].cumprod()
-    # if overflows, groupby product casts to float
-    # while numpy passes back invalid values
     df["value"] = df["value"].astype(float)
+    actual = df.groupby("key")["value"].cumprod()
     expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
     expected.name = "value"
     tm.assert_series_equal(actual, expected)
 
 
+def test_groupby_cumprod_overflow():
+    # GH#37493 if we overflow we return garbage consistent with numpy
+    df = DataFrame({"key": ["b"] * 4, "value": 100_000})
+    actual = df.groupby("key")["value"].cumprod()
+    expected = Series(
+        [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
+        name="value",
+    )
+    tm.assert_series_equal(actual, expected)
+
+    numpy_result = df.groupby("key", group_keys=False)["value"].apply(
+        lambda x: x.cumprod()
+    )
+    numpy_result.name = "value"
+    tm.assert_series_equal(actual, numpy_result)
+
+
 def test_groupby_cumprod_nan_influences_other_columns():
     # GH#48064
     df = DataFrame(
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2847,14 +2847,14 @@ def test_single_element_list_grouping():
         values, _ = next(iter(df.groupby(["a"])))
 
 
-@pytest.mark.parametrize("func", ["sum", "cumsum", "prod"])
+@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
 def test_groupby_avoid_casting_to_float(func):
     # GH#37493
     val = 922337203685477580
     df = DataFrame({"a": 1, "b": [val]})
     result = getattr(df.groupby("a"), func)() - val
     expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
-    if func == "cumsum":
+    if func in ["cumsum", "cumprod"]:
         expected = expected.reset_index(drop=True)
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py
@@ -3,7 +3,7 @@
 
 from pandas._libs import groupby as libgroupby
 from pandas._libs.groupby import (
-    group_cumprod_float64,
+    group_cumprod,
     group_cumsum,
     group_mean,
     group_var,
@@ -194,7 +194,7 @@ def test_cython_group_transform_cumsum(np_dtype):
 def test_cython_group_transform_cumprod():
     # see gh-4095
     dtype = np.float64
-    pd_op, np_op = group_cumprod_float64, np.cumproduct
+    pd_op, np_op = group_cumprod, np.cumproduct
     _check_cython_group_transform_cumulative(pd_op, np_op, dtype)
 
 
@@ -209,7 +209,7 @@ def test_cython_group_transform_algos():
     data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
     actual = np.zeros_like(data)
     actual.fill(np.nan)
-    group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike)
+    group_cumprod(actual, data, labels, ngroups, is_datetimelike)
     expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
     tm.assert_numpy_array_equal(actual[:, 0], expected)