From 545269852096fc3104bcdf5cda42ca5ef215f562 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 18 Aug 2022 23:06:55 +0200 Subject: [PATCH 1/7] ENH: Support mask in groupby cumprod --- pandas/_libs/groupby.pyi | 4 +- pandas/_libs/groupby.pyx | 67 ++++++++++++++++++------- pandas/core/groupby/ops.py | 9 ++-- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_groupby.py | 4 +- pandas/tests/groupby/test_libgroupby.py | 6 +-- 6 files changed, 64 insertions(+), 28 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index c8e9df6cd6b38..4a3f47b5024b0 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -11,13 +11,15 @@ def group_median_float64( labels: npt.NDArray[np.int64], min_count: int = ..., # Py_ssize_t ) -> None: ... -def group_cumprod_float64( +def group_cumprod( out: np.ndarray, # float64_t[:, ::1] values: np.ndarray, # const float64_t[:, :] labels: np.ndarray, # const int64_t[:] ngroups: int, is_datetimelike: bool, skipna: bool = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_cumsum( out: np.ndarray, # int64float_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 563abf949dbbc..bc4acc1971092 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -148,15 +148,24 @@ def group_median_float64( ptr += size +ctypedef fused int64float_t: + int64_t + uint64_t + float32_t + float64_t + + @cython.boundscheck(False) @cython.wraparound(False) -def group_cumprod_float64( - float64_t[:, ::1] out, - const float64_t[:, :] values, +def group_cumprod( + int64float_t[:, ::1] out, + ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, bint skipna=True, + const uint8_t[:, :] mask=None, + uint8_t[:, ::1] result_mask=None, ) -> None: """ Cumulative product of columns of `values`, in row groups `labels`. @@ -175,6 +184,10 @@ def group_cumprod_float64( Always false, `values` is never datetime-like. skipna : bool If true, ignore nans in `values`. + mask: np.ndarray[uint8], optional + Mask of values + result_mask: np.ndarray[int8], optional + Mask of out array Notes ----- @@ -182,12 +195,17 @@ def group_cumprod_float64( """ cdef: Py_ssize_t i, j, N, K, size - float64_t val - float64_t[:, ::1] accum + int64float_t val, na_val + int64float_t[:, ::1] accum intp_t lab + uint8_t[:, ::1] accum_mask + bint isna_entry, isna_prev = False + bint uses_mask = mask is not None N, K = (values).shape - accum = np.ones((ngroups, K), dtype=np.float64) + accum = np.ones((ngroups, K), dtype=np.asarray(values).dtype) + na_val = _get_na_val(0, is_datetimelike) + accum_mask = np.zeros((ngroups, K), dtype="uint8") with nogil: for i in range(N): @@ -197,20 +215,35 @@ def group_cumprod_float64( continue for j in range(K): val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] + + if uses_mask: + isna_entry = mask[i, j] + elif int64float_t is float64_t or int64float_t is float32_t: + isna_entry = val != val else: - out[i, j] = NaN - if not skipna: - accum[lab, j] = NaN + isna_entry = False + if not isna_entry: + isna_prev = accum_mask[lab, j] + if isna_prev: + out[i, j] = na_val + if uses_mask: + result_mask[i, j] = True -ctypedef fused int64float_t: - int64_t - uint64_t - float32_t - float64_t + else: + accum[lab, j] *= val + out[i, j] = accum[lab, j] + + else: + if uses_mask: + result_mask[i, j] = True + out[i, j] = 0 + else: + out[i, j] = na_val + + if not skipna: + accum[lab, j] = na_val + accum_mask[lab, j] = True @cython.boundscheck(False) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 13df7ba465741..9035351d3ea67 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "ohlc": "group_ohlc", }, "transform": { - "cumprod": "group_cumprod_float64", + "cumprod": "group_cumprod", "cumsum": "group_cumsum", "cummin": "group_cummin", "cummax": "group_cummax", @@ -158,6 +158,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "rank", "sum", "ohlc", + "cumprod", "cumsum", } @@ -214,7 +215,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["median", "cumprod"]: + if how in ["median"]: # these two only have float64 implementations # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -227,7 +228,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: # result may still include NaN, so we have to cast values = ensure_float64(values) - elif how in ["sum", "ohlc", "cumsum"]: + elif how in ["sum", "ohlc", "cumsum", "cumprod"]: # Avoid overflow during group op if values.dtype.kind == "i": values = ensure_int64(values) @@ -326,7 +327,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: """ how = self.how - if how in ["sum", "cumsum", "sum", "prod"]: + if how in ["sum", "cumsum", "sum", "prod", "cumprod"]: if dtype == np.dtype(bool): return np.dtype(np.int64) elif how in ["mean", "median", "var"]: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4b9f5deb40849..9e3b34129d4c7 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -641,10 +641,10 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) df = DataFrame({"key": ["b"] * 100, "value": 2}) + df["value"] = df["value"].astype(float) actual = df.groupby("key")["value"].cumprod() # if overflows, groupby product casts to float # while numpy passes back invalid values - df["value"] = df["value"].astype(float) expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) expected.name = "value" tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ca356d94d84f8..abf690a1c8ed0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2846,14 +2846,14 @@ def test_single_element_list_grouping(): values, _ = next(iter(df.groupby(["a"]))) -@pytest.mark.parametrize("func", ["sum", "cumsum"]) +@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod"]) def test_groupby_sum_avoid_casting_to_float(func): # GH#37493 val = 922337203685477580 df = DataFrame({"a": 1, "b": [val]}) result = getattr(df.groupby("a"), func)() - val expected = DataFrame({"b": [0]}, index=Index([1], name="a")) - if func == "cumsum": + if func in ["cumsum", "cumprod"]: expected = expected.reset_index(drop=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 24abbd0f65795..4cea6d10af9e0 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -3,7 +3,7 @@ from pandas._libs import groupby as libgroupby from pandas._libs.groupby import ( - group_cumprod_float64, + group_cumprod, group_cumsum, group_mean, group_var, @@ -194,7 +194,7 @@ def test_cython_group_transform_cumsum(np_dtype): def test_cython_group_transform_cumprod(): # see gh-4095 dtype = np.float64 - pd_op, np_op = group_cumprod_float64, np.cumproduct + pd_op, np_op = group_cumprod, np.cumproduct _check_cython_group_transform_cumulative(pd_op, np_op, dtype) @@ -209,7 +209,7 @@ def test_cython_group_transform_algos(): data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") actual = np.zeros_like(data) actual.fill(np.nan) - group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + group_cumprod(actual, data, labels, ngroups, is_datetimelike) expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected) From af0c539c2b2dcd0113c2fa161bc109ad5b47b6f1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 18 Aug 2022 23:11:50 +0200 Subject: [PATCH 2/7] Add whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6707d4985d06c..cbd8febd50c8a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -906,6 +906,7 @@ Performance improvements - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) - Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`) - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`) +- Performance improvement in :meth:`.GroupBy.cumprod` for extension array dtypes (:issue:`37493`) - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`) - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) From e931b93c3a41f8f5e7f6ea4c2bd397f6bf7497b2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 30 Aug 2022 09:58:41 +0200 Subject: [PATCH 3/7] Move whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 - doc/source/whatsnew/v1.6.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 5a3607c979bca..711352775400e 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -960,7 +960,6 @@ Performance improvements - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) - Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`) - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`) -- Performance improvement in :meth:`.GroupBy.cumprod` for extension array dtypes (:issue:`37493`) - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`) - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index eac5e5d3a0f52..e3dc67d5577bb 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -100,7 +100,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :meth:`.GroupBy.cumprod` for extension array dtypes (:issue:`37493`) - .. --------------------------------------------------------------------------- From 781c6781575113248ba02f63717828df9d440262 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 00:10:04 +0200 Subject: [PATCH 4/7] Adress review --- doc/source/whatsnew/v1.6.0.rst | 28 +++++++++++++++++++++++++-- pandas/tests/groupby/test_function.py | 19 ++++++++++++++++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index e24d3a3b4800b..beaa6eda45c49 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -41,8 +41,32 @@ These are bug fixes that might have notable behavior changes. .. _whatsnew_160.notable_bug_fixes.notable_bug_fix1: -notable_bug_fix1 -^^^^^^^^^^^^^^^^ +:meth:`.GroupBy.cumsum` and :meth:`.GroupBy.cumprod` overflow instead of lossy casting to float +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions we cast to float when applying ``cumsum`` and ``cumprod`` which +lead to incorrect results even if the result could be hold by ``int64`` dtype. +Additionally, the aggregation overflows consistent with numpy when the limit of +``int64`` is reached. + +*Old Behavior* + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({"key": ["b"] * 7, "value": 625}) + In [4]: df.groupby("key")["value"].cumprod()[5] + Out[5]: 5.960464477539062e+16 + +We return incorrect results with the 6th value. + +*New Behavior* + +.. ipython:: python + + df = pd.DataFrame({"key": ["b"] * 6, "value": 625}) + df.groupby("key")["value"].cumprod() + +We overflow with the 7th value but the 6th value is still correct. .. _whatsnew_160.notable_bug_fixes.notable_bug_fix2: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 93aa44af51dd0..ed2d9f4140e68 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -643,13 +643,28 @@ def test_groupby_cumprod(): df = DataFrame({"key": ["b"] * 100, "value": 2}) df["value"] = df["value"].astype(float) actual = df.groupby("key")["value"].cumprod() - # if overflows, groupby product casts to float - # while numpy passes back invalid values expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) expected.name = "value" tm.assert_series_equal(actual, expected) +def test_groupby_cumprod_overflow(): + # GH#37493 if we overflow we return garbage consistent with numpy + df = DataFrame({"key": ["b"] * 4, "value": 100_000}) + actual = df.groupby("key")["value"].cumprod() + expected = Series( + [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], + name="value", + ) + tm.assert_series_equal(actual, expected) + + numpy_result = df.groupby("key", group_keys=False)["value"].apply( + lambda x: x.cumprod() + ) + numpy_result.name = "value" + tm.assert_series_equal(actual, numpy_result) + + def test_groupby_cumprod_nan_influences_other_columns(): # GH#48064 df = DataFrame( From 2476651135c00b5cc460b65374922309e0217a5e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 00:18:50 +0200 Subject: [PATCH 5/7] Fix example --- doc/source/whatsnew/v1.6.0.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index beaa6eda45c49..93f7ec1118567 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -53,9 +53,9 @@ Additionally, the aggregation overflows consistent with numpy when the limit of .. code-block:: ipython - In [3]: df = pd.DataFrame({"key": ["b"] * 7, "value": 625}) - In [4]: df.groupby("key")["value"].cumprod()[5] - Out[5]: 5.960464477539062e+16 + In [1]: df = pd.DataFrame({"key": ["b"] * 7, "value": 625}) + In [2]: df.groupby("key")["value"].cumprod()[5] + Out[2]: 5.960464477539062e+16 We return incorrect results with the 6th value. @@ -63,10 +63,10 @@ We return incorrect results with the 6th value. .. ipython:: python - df = pd.DataFrame({"key": ["b"] * 6, "value": 625}) + df = pd.DataFrame({"key": ["b"] * 7, "value": 625}) df.groupby("key")["value"].cumprod() -We overflow with the 7th value but the 6th value is still correct. +We overflow with the 7th value, but the 6th value is still correct. .. _whatsnew_160.notable_bug_fixes.notable_bug_fix2: From fdfbf22f289ab657a39e8c25e4a5ea1d76e50517 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 21:36:16 +0200 Subject: [PATCH 6/7] Clarify --- doc/source/whatsnew/v1.6.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 7951e9899238d..90bf7e4a77fdd 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -48,7 +48,8 @@ These are bug fixes that might have notable behavior changes. In previous versions we cast to float when applying ``cumsum`` and ``cumprod`` which lead to incorrect results even if the result could be hold by ``int64`` dtype. -Additionally, the aggregation overflows consistent with numpy when the limit of +Additionally, the aggregation overflows consistent with numpy and the regular +:meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods when the limit of ``int64`` is reached. *Old Behavior* From c6fc53ccd5e4c1c5be66377a1b9e827829839638 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 15 Sep 2022 22:47:38 +0200 Subject: [PATCH 7/7] Change dtype access --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/_libs/groupby.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 90bf7e4a77fdd..2f10d83b8d058 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -50,7 +50,7 @@ In previous versions we cast to float when applying ``cumsum`` and ``cumprod`` w lead to incorrect results even if the result could be hold by ``int64`` dtype. Additionally, the aggregation overflows consistent with numpy and the regular :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods when the limit of -``int64`` is reached. +``int64`` is reached (:issue:`37493`). *Old Behavior* diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f2aa0a0d1c114..f798655e9d922 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -274,7 +274,7 @@ def group_cumprod( bint uses_mask = mask is not None N, K = (values).shape - accum = np.ones((ngroups, K), dtype=np.asarray(values).dtype) + accum = np.ones((ngroups, K), dtype=(values).dtype) na_val = _get_na_val(0, is_datetimelike) accum_mask = np.zeros((ngroups, K), dtype="uint8")