pandas-dev · mroeschke · Dec 13, 2022 · Sep 13, 2019 · Sep 14, 2019 · Sep 15, 2019
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -32,6 +32,7 @@ objects.
    .. autosummary::
       :toctree: api/
 
+      api.extensions.ExtensionArray._accumulate
       api.extensions.ExtensionArray._concat_same_type
       api.extensions.ExtensionArray._formatter
       api.extensions.ExtensionArray._from_factorized

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -58,6 +58,7 @@ Other enhancements
 - Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
+- Added ``cumsum``, ``cumprod``, ``cummin`` and ``cummax`` to the ``ExtensionArray`` interface via ``_accumulate`` (:issue:`28385`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)
 - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1123,6 +1123,17 @@ def all_logical_operators(request):
     return request.param
 
 
+_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"]
+
+
+@pytest.fixture(params=_all_numeric_accumulations)
+def all_numeric_accumulations(request):
+    """
+    Fixture parametrized over every name in ``_all_numeric_accumulations``.
+    """
+    return request.param
+
+
 # ----------------------------------------------------------------
 # Data sets/files
 # ----------------------------------------------------------------

diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py
@@ -0,0 +1,92 @@
+"""
+masked_accumulations.py is for accumulation algorithms using a mask-based approach
+for missing values.
+"""
+
+from __future__ import annotations
+
+from typing import Callable
+
+import numpy as np
+
+from pandas._typing import npt
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+)
+
+
+def _cum_func(
+    func: Callable,
+    values: np.ndarray,
+    mask: npt.NDArray[np.bool_],
+    *,
+    skipna: bool = True,
+):
+    """
+    Accumulations for 1D masked array.
+
+    NA slots of ``values`` are overwritten in place with a fill neutral to ``func``.
+
+    Parameters
+    ----------
+    func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
+    values : np.ndarray
+        Float, integer or boolean data; other dtypes raise NotImplementedError.
+    mask : np.ndarray
+        Boolean numpy array (True values indicate missing values).
+    skipna : bool, default True
+        If False, every position from the first NA onwards is NA in the result.
+
+    Returns
+    -------
+    tuple of (np.ndarray, np.ndarray)
+        Accumulated values and result mask (the input ``mask`` object if skipna).
+    """
+    if is_float_dtype(values):
+        # +/-inf rather than finfo.max/min: a finite fill would clobber real infs
+        lowest, highest = -np.inf, np.inf
+    elif is_integer_dtype(values):
+        int_info = np.iinfo(values.dtype.type)
+        lowest, highest = int_info.min, int_info.max
+    elif is_bool_dtype(values):
+        lowest, highest = False, True
+    else:
+        raise NotImplementedError(
+            f"No masked accumulation defined for dtype {values.dtype.type}"
+        )
+    try:
+        fill_value = {
+            np.cumprod: 1,
+            np.maximum.accumulate: lowest,
+            np.cumsum: 0,
+            np.minimum.accumulate: highest,
+        }[func]
+    except KeyError as err:
+        raise NotImplementedError(
+            f"No accumulation for {func} implemented on BaseMaskedArray"
+        ) from err
+
+    values[mask] = fill_value
+    if not skipna:
+        # once an NA has been seen, all later positions are NA as well
+        mask = np.maximum.accumulate(mask)
+    return func(values), mask
+
+
+def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.cumsum, values, mask, skipna=skipna)
+
+
+def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.cumprod, values, mask, skipna=skipna)
+
+
+def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)
+
+
+def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -133,6 +133,7 @@ class ExtensionArray:
     tolist
     unique
     view
+    _accumulate
     _concat_same_type
     _formatter
     _from_factorized
@@ -182,8 +183,9 @@ class ExtensionArray:
     as they only compose abstract methods. Still, a more efficient
     implementation may be available, and these methods can be overridden.
 
-    One can implement methods to handle array reductions.
+    One can implement methods to handle array accumulations or reductions.
 
+    * _accumulate
     * _reduce
 
     One can implement methods to handle parsing from strings that will be used
@@ -1368,6 +1370,38 @@ def _concat_same_type(
     def _can_hold_na(self) -> bool:
         return self.dtype._can_hold_na
 
+    def _accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> ExtensionArray:
+        """
+        Return an ExtensionArray holding the result of an accumulation operation.
+
+        The underlying data type might change.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            - cummin
+            - cummax
+            - cumsum
+            - cumprod
+        skipna : bool, default True
+            If True, skip NA values.
+        **kwargs
+            Additional keyword arguments passed to the accumulation function.
+            Currently, there is no supported kwarg.
+
+        Returns
+        -------
+        ExtensionArray
+
+        Raises
+        ------
+        NotImplementedError : subclass does not define accumulations
+        """
+        raise NotImplementedError(f"cannot perform {name} with type {self.dtype}")
+
     def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         """
         Return a scalar result of performing the reduction operation.

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -26,6 +26,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import ops
+from pandas.core.array_algos import masked_accumulations
 from pandas.core.arrays.masked import (
     BaseMaskedArray,
     BaseMaskedDtype,
@@ -378,3 +379,19 @@ def _logical_method(self, other, op):
 
         # i.e. BooleanArray
         return self._maybe_mask_result(result, mask)
+
+    def _accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> BaseMaskedArray:
+        """cummin/cummax keep boolean dtype; other ops run on an IntegerArray."""
+        # Copy before delegating: the masked helpers overwrite NA slots of the data
+        # in place and, when skipna=True, return the very mask they were given.
+        mask = self._mask.copy()
+        if name in ("cummin", "cummax"):
+            op = getattr(masked_accumulations, name)
+            data, mask = op(self._data.copy(), mask, skipna=skipna, **kwargs)
+            return type(self)(data, mask, copy=False)
+        from pandas.core.arrays import IntegerArray
+
+        int_data = self._data.astype(int)  # astype already yields a fresh buffer
+        return IntegerArray(int_data, mask)._accumulate(name, skipna=skipna, **kwargs)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1359,6 +1359,22 @@ def _addsub_object_array(self, other: np.ndarray, op):
         result = result.reshape(self.shape)
         return result
 
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
+        """
+        Return the running minimum/maximum of this array as a new array.
+
+        Only "cummin" and "cummax" are handled; other names raise TypeError.
+        ``**kwargs`` is accepted for interface compatibility and not used.
+        """
+        if name in {"cummin", "cummax"}:
+            func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate
+            data = self._ndarray.copy()
+            result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna))
+            # freq is not forwarded: a running min/max of regularly spaced values
+            # is generally not regularly spaced, so the old freq would be stale.
+            return type(self)._simple_new(result, dtype=self.dtype)
+        raise TypeError(f"Accumulation {name} not supported for {type(self)}")
+
     @unpack_zerodim_and_defer("__add__")
     def __add__(self, other):
         other_dtype = getattr(other, "dtype", None)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -74,7 +74,10 @@
     isin,
     take,
 )
-from pandas.core.array_algos import masked_reductions
+from pandas.core.array_algos import (
+    masked_accumulations,
+    masked_reductions,
+)
 from pandas.core.array_algos.quantile import quantile_with_mask
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays import ExtensionArray
@@ -1335,3 +1338,14 @@ def all(self, *, skipna: bool = True, **kwargs):
                 return result
             else:
                 return self.dtype.na_value
+
+    def _accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> BaseMaskedArray:
+        """Apply cumulative op ``name``; the result shares no buffer with self."""
+        # Copy first: the helper fills NA slots of ``data`` in place and, when
+        # skipna=True, hands back the very mask object it was given.
+        data, mask = self._data.copy(), self._mask.copy()
+        op = getattr(masked_accumulations, name)
+        data, mask = op(data, mask, skipna=skipna, **kwargs)
+        return type(self)(data, mask, copy=False)
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -410,6 +410,22 @@ def std(
             return self._box_func(result)
         return self._from_backing_data(result)
 
+    # ----------------------------------------------------------------
+    # Accumulations
+
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
+        """Compute cumsum/cumprod here; defer every other ``name`` to the parent."""
+        if name not in {"cumsum", "cumprod"}:
+            return super()._accumulate(name, skipna=skipna, **kwargs)
+
+        func = np.cumsum if name == "cumsum" else np.cumprod
+        # Work on a copy of the backing ndarray, made only on this path.
+        # NOTE(review): presumably na_accum_func may write into its input; confirm.
+        data = self._ndarray.copy()
+        result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna))
+        # The result is built with freq=None rather than inheriting self.freq.
+        return type(self)._simple_new(result, freq=None, dtype=self.dtype)
+
     # ----------------------------------------------------------------
     # Rendering Methods
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -10859,7 +10859,11 @@ def _accum_func(
         def block_accum_func(blk_values):
             values = blk_values.T if hasattr(blk_values, "T") else blk_values
 
-            result = nanops.na_accum_func(values, func, skipna=skipna)
+            result: np.ndarray | ExtensionArray
+            if isinstance(values, ExtensionArray):
+                result = values._accumulate(name, skipna=skipna, **kwargs)
+            else:
+                result = nanops.na_accum_func(values, func, skipna=skipna)
 
             result = result.T if hasattr(result, "T") else result
             return result

diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
@@ -41,6 +41,10 @@ class TestMyDtype(BaseDtypeTests):
 ``assert_series_equal`` on your base test class.
 
 """
+from pandas.tests.extension.base.accumulate import (  # noqa
+    BaseNoAccumulateTests,
+    BaseNumericAccumulateTests,
+)
 from pandas.tests.extension.base.casting import BaseCastingTests  # noqa
 from pandas.tests.extension.base.constructors import BaseConstructorsTests  # noqa
 from pandas.tests.extension.base.dim2 import (  # noqa

diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py
@@ -0,0 +1,43 @@
+import pytest
+
+import pandas as pd
+from pandas.tests.extension.base.base import BaseExtensionTests
+
+
+class BaseAccumulateTests(BaseExtensionTests):
+    """
+    Accumulation specific tests. Generally these only
+    make sense for numeric/boolean operations.
+    """
+
+    def check_accumulate(self, s, op_name, skipna):
+        result = getattr(s, op_name)(skipna=skipna)
+        # The op runs before the skip check because the check needs result.dtype.
+        if result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna:
+            pytest.skip(
+                f"Float32 precision lead to large differences with op {op_name} "
+                f"and skipna={skipna}"
+            )
+        # Reference: the same op on a float64 cast of ``s``; dtypes may differ.
+        expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
+        self.assert_series_equal(result, expected, check_dtype=False)
+
+
+class BaseNoAccumulateTests(BaseAccumulateTests):
+    """For arrays that define no accumulations: every accumulation must raise."""
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna):
+        op_name = all_numeric_accumulations
+        s = pd.Series(data)
+        # Each accumulation name is expected to raise NotImplementedError.
+        with pytest.raises(NotImplementedError):
+            getattr(s, op_name)(skipna=skipna)
+
+
+class BaseNumericAccumulateTests(BaseAccumulateTests):
+    """Run every numeric accumulation on ``data`` through ``check_accumulate``."""
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
+        ser = pd.Series(data)
+        self.check_accumulate(ser, all_numeric_accumulations, skipna)
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -16,6 +16,8 @@
 import numpy as np
 import pytest
 
+from pandas.core.dtypes.common import is_bool_dtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.core.arrays.boolean import BooleanDtype
@@ -393,6 +395,15 @@ class TestUnaryOps(base.BaseUnaryOpsTests):
     pass
 
 
+class TestNumericAccumulation(base.BaseNumericAccumulateTests):
+    def check_accumulate(self, s, op_name, skipna):
+        """Compare against the float64 result; cummin/cummax must stay boolean."""
+        result = getattr(s, op_name)(skipna=skipna)
+        expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
+        self.assert_series_equal(result, expected, check_dtype=False)
+        assert op_name not in ("cummin", "cummax") or is_bool_dtype(result)
+
+
 class TestParsing(base.BaseParsingTests):
     pass
 

diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
@@ -156,6 +156,10 @@ class TestReduce(base.BaseNoReduceTests):
     pass
 
 
+class TestAccumulate(base.BaseNoAccumulateTests):
+    pass
+
+
 class TestMethods(base.BaseMethodsTests):
     @pytest.mark.xfail(reason="Unobserved categories included")
     def test_value_counts(self, all_data, dropna):

diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
@@ -217,3 +217,7 @@ class TestParsing(base.BaseParsingTests):
 @pytest.mark.filterwarnings("ignore:overflow encountered in reduce:RuntimeWarning")
 class Test2DCompat(base.Dim2CompatTests):
     pass
+
+
+class TestNumericAccumulation(base.BaseNumericAccumulateTests):
+    pass