ENH: Add cumulative methods to ea (pandas-dev#48111)

phofl · datajanko · mroeschke · web-flow · commit b5953aa9b25a · 2022-12-12T17:43:51.000-08:00
* define accumulation interface for ExtensionArrays * reformulate doc string * creates baseExtension tests for accumulate * adds fixtures for numeric_accumulations * fixes typos * adds accumulate tests for integer arrays * fixes typo * first implementation of cumsum * stashed merge conflict * fixes formatting * first green test for integer extension arrays and cumsum * first passing tests for cummin and cummax * utilizes na_accum_func * removes delegation leftover * creates running tests * removes ABCExtensionArray Type hint * removes clutter from generic.py * removes clutter in _accumulate * adds typehints for ExtensionArray and IntegerArray * delegates the accumulate calls to extension arrays * removes diff in nanops * removes unwanted pattern * makes output types for sum and prod explicit * makes the base accumulate test more general by not comparing types * implements accumulation for boolean arrays * uses f-string in base.py * uses blockmanager also for extension arrays * fixes flake8 issues * removes uncommented code * adds todo for runtime warning * reuses integer array to accumulate for booleans * removes runtimewarning catching * removes TODOs * adds accumulate to autosummary * excludes datetime from propagating to _accumulate * uses pandas.testing instead of pandas.util.testing in accumulate * replaces assert_almost_equal with assert_series_equal * dtypes to lowercase * lowercase of uint and int64 dtype in _accumulate * uses hint of @simonjayhawkins concerning assert series equals * adds whatsnew entry * moves changes to 1.2.0 * uses na_accum_func * delegate to EAs _accumulate function in block mgr * moves implementation from nanops to masked_accumulations * fixes typing annotations in base and masked * fixes merge error * fills na values without nanops * fixes incorrect call to cumsum and changes to cumprod * add _accumulate to boolean * makes tests a lot easier - cumprod tests still fail * adds BaseNumericAccumulation for floating masked array * tests 
no numeric accumulations according to _accumulate interface * uses NotImplementedError in base accumulate function * ensures the fill values are data independent additionally, remove min_count as irrelevant * adds accumulation for datetimelikes in generic.py ensure that datetimelikes are wrapped create a twin of masked_accumulations for datetimelikes timedeltas also allow cumsum and cumprod, theoretically * actually adds datetimelike accumulation algos * fixes absolute imports * changes error to catch to adhere to changed implementation * Remove blank line in old whatsnew * Remove merge error * Fix additional merge errors * Refactor datetimelike accum funcs * Remove unnecessary import * Refactor tests * Skip test * Fix mypy * Fix dtype creation * Fix cumprod tests * Fix docstring * Address review * Address review * Update pandas/core/arrays/base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/extension/test_integer.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Add comment * Clarify comment * Fix pre commit * Add whatsnew * Move to top of file * Change error * Change _data * Remove * Add todo * Fix typo * Adjust var * Special case * Fix tests * Combine classes * Fix mypy Co-authored-by: Jan Koch <Jan.Koch@tu-dortmund.de> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -32,6 +32,7 @@ objects.
    .. autosummary::
       :toctree: api/
 
+      api.extensions.ExtensionArray._accumulate
       api.extensions.ExtensionArray._concat_same_type
       api.extensions.ExtensionArray._formatter
       api.extensions.ExtensionArray._from_factorized
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -76,6 +76,7 @@ Other enhancements
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added support for extension array dtypes in :func:`merge` (:issue:`44240`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
+- Added ``cumsum``, ``cumprod``, ``cummin`` and ``cummax`` to the ``ExtensionArray`` interface via ``_accumulate`` (:issue:`28385`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)
 - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1123,6 +1123,17 @@ def all_logical_operators(request):
     return request.param
 
 
+_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"]
+
+
+@pytest.fixture(params=_all_numeric_accumulations)
+def all_numeric_accumulations(request):
+    """
+    Fixture parametrized over the names in ``_all_numeric_accumulations``.
+    """
+    return request.param
+
+
 # ----------------------------------------------------------------
 # Data sets/files
 # ----------------------------------------------------------------
diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py
@@ -0,0 +1,114 @@
+"""
+masked_accumulations.py is for accumulation algorithms using a mask-based approach
+for missing values.
+"""
+
+from __future__ import annotations
+
+from typing import Callable
+
+import numpy as np
+
+from pandas._typing import npt
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+)
+
+
+def _cum_func(
+    func: Callable,
+    values: np.ndarray,
+    mask: npt.NDArray[np.bool_],
+    *,
+    skipna: bool = True,
+) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
+    """
+    Accumulations for 1D masked array.
+
+    We will modify values in place to replace NAs with the appropriate fill value.
+
+    Parameters
+    ----------
+    func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
+    values : np.ndarray
+        Numpy array with the values (can be of any dtype that supports the
+        operation). Masked positions are overwritten in place.
+    mask : np.ndarray
+        Boolean numpy array (True values indicate missing values).
+    skipna : bool, default True
+        Whether to skip NA. If False, every position from the first missing
+        value onwards is marked as missing in the returned mask.
+
+    Returns
+    -------
+    values : np.ndarray
+        The accumulated values.
+    mask : np.ndarray
+        Mask of the result. With ``skipna=True`` this is the very ``mask``
+        object that was passed in, not a copy.
+
+    Raises
+    ------
+    NotImplementedError
+        If the dtype of ``values`` is not float, integer or bool, or if
+        ``func`` is not one of the supported accumulation functions.
+    """
+    dtype_info: np.iinfo | np.finfo
+    if is_float_dtype(values):
+        dtype_info = np.finfo(values.dtype.type)
+    elif is_integer_dtype(values):
+        dtype_info = np.iinfo(values.dtype.type)
+    elif is_bool_dtype(values):
+        # Max value of bool is 1, but since we are setting into a boolean
+        # array, 255 is fine as well. Min value has to be 0 when setting
+        # into the boolean array.
+        dtype_info = np.iinfo(np.uint8)
+    else:
+        raise NotImplementedError(
+            f"No masked accumulation defined for dtype {values.dtype.type}"
+        )
+    try:
+        # Masked slots are filled with a value that leaves the running result
+        # unchanged: the identity of the operation for sum/prod and the
+        # dtype's minimum/maximum value for max/min.
+        fill_value = {
+            np.cumprod: 1,
+            np.maximum.accumulate: dtype_info.min,
+            np.cumsum: 0,
+            np.minimum.accumulate: dtype_info.max,
+        }[func]
+    except KeyError:
+        # The KeyError from the lookup is an implementation detail; suppress it
+        # so that callers only see the NotImplementedError.
+        raise NotImplementedError(
+            f"No accumulation for {func} implemented on BaseMaskedArray"
+        ) from None
+
+    values[mask] = fill_value
+
+    if not skipna:
+        # Running maximum over booleans: once a missing value is seen, every
+        # later position is missing as well.
+        mask = np.maximum.accumulate(mask)
+
+    values = func(values)
+    return values, mask
+
+
+def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.cumsum, values, mask, skipna=skipna)
+
+
+def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.cumprod, values, mask, skipna=skipna)
+
+
+def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)
+
+
+def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
+    return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -133,6 +133,7 @@ class ExtensionArray:
     tolist
     unique
     view
+    _accumulate
     _concat_same_type
     _formatter
     _from_factorized
@@ -182,8 +183,9 @@ class ExtensionArray:
     as they only compose abstract methods. Still, a more efficient
     implementation may be available, and these methods can be overridden.
 
-    One can implement methods to handle array reductions.
+    One can implement methods to handle array accumulations or reductions.
 
+    * _accumulate
     * _reduce
 
     One can implement methods to handle parsing from strings that will be used
@@ -1368,6 +1370,38 @@ def _concat_same_type(
     def _can_hold_na(self) -> bool:
         return self.dtype._can_hold_na
 
+    def _accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> ExtensionArray:
+        """
+        Return an ExtensionArray holding the result of an accumulation operation.
+
+        The dtype of the result may differ from the dtype of ``self``.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            - cummin
+            - cummax
+            - cumsum
+            - cumprod
+        skipna : bool, default True
+            If True, skip NA values.
+        **kwargs
+            Additional keyword arguments passed to the accumulation function.
+            Currently, there is no supported kwarg.
+
+        Returns
+        -------
+        ExtensionArray
+
+        Raises
+        ------
+        NotImplementedError : if the subclass does not override this method
+        """
+        raise NotImplementedError(f"cannot perform {name} with type {self.dtype}")
+
     def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         """
         Return a scalar result of performing the reduction operation.
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -26,6 +26,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import ops
+from pandas.core.array_algos import masked_accumulations
 from pandas.core.arrays.masked import (
     BaseMaskedArray,
     BaseMaskedDtype,
@@ -378,3 +379,43 @@ def _logical_method(self, other, op):
 
         # i.e. BooleanArray
         return self._maybe_mask_result(result, mask)
+
+    def _accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> BaseMaskedArray:
+        """
+        Return the result of a cumulative operation on this array.
+
+        "cummin" and "cummax" are computed on the boolean data and return an
+        array of the same type as ``self``. Every other ``name`` is delegated
+        to ``IntegerArray._accumulate`` on an integer copy of the data.
+
+        Parameters
+        ----------
+        name : str
+            Name of the accumulation to perform.
+        skipna : bool, default True
+            Forwarded to the accumulation function.
+        **kwargs
+            Forwarded to the accumulation function.
+
+        Returns
+        -------
+        BaseMaskedArray
+        """
+        # Work on copies: the masked accumulation helpers overwrite the masked
+        # slots of the data in place and, with skipna=True, hand back the very
+        # mask object they were given. Without copying, the result would share
+        # its mask with ``self`` and ``self._data`` would be modified.
+        mask = self._mask.copy()
+        if name in ("cummin", "cummax"):
+            op = getattr(masked_accumulations, name)
+            data, mask = op(self._data.copy(), mask, skipna=skipna, **kwargs)
+            return type(self)(data, mask, copy=False)
+        else:
+            from pandas.core.arrays import IntegerArray
+
+            # astype returns a new array, so the data needs no extra copy.
+            return IntegerArray(self._data.astype(int), mask)._accumulate(
+                name, skipna=skipna, **kwargs
+            )
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1352,6 +1352,52 @@ def _addsub_object_array(self, other: np.ndarray, op):
         result = result.reshape(self.shape)
         return result
 
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
+        """
+        Return the cumulative minimum or maximum of this array.
+
+        Parameters
+        ----------
+        name : str
+            "cummin" or "cummax".
+        skipna : bool, default True
+            Passed on to ``nanops.na_accum_func``.
+        **kwargs
+            Not used by this implementation.
+
+        Returns
+        -------
+        Array of the same type and dtype as ``self``.
+
+        Raises
+        ------
+        TypeError
+            If ``name`` is neither "cummin" nor "cummax".
+        """
+        if name not in {"cummin", "cummax"}:
+            raise TypeError(f"Accumulation {name} not supported for {type(self)}")
+
+        if is_period_dtype(self.dtype):
+            data = self
+            # Period data: carry the freq over to the result.
+            freq = self.freq
+        else:
+            # error: Incompatible types in assignment (expression has type
+            # "ndarray[Any, Any]", variable has type "DatetimeLikeArrayMixin")
+            data = self._ndarray.copy()  # type: ignore[assignment]
+            # A running min/max is in general not regularly spaced, so the
+            # freq of the input must not be attached to the result.
+            freq = None
+
+        func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate
+        result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna))
+
+        # error: Unexpected keyword argument "freq" for
+        # "_simple_new" of "NDArrayBacked"  [call-arg]
+        return type(self)._simple_new(
+            result, freq=freq, dtype=self.dtype  # type: ignore[call-arg]
+        )
+
     @unpack_zerodim_and_defer("__add__")
     def __add__(self, other):
         other_dtype = getattr(other, "dtype", None)
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -73,7 +73,10 @@
     isin,
     take,
 )
-from pandas.core.array_algos import masked_reductions
+from pandas.core.array_algos import (
+    masked_accumulations,
+    masked_reductions,
+)
 from pandas.core.array_algos.quantile import quantile_with_mask
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays import ExtensionArray
@@ -1328,3 +1331,35 @@ def all(self, *, skipna: bool = True, **kwargs):
                 return result
             else:
                 return self.dtype.na_value
+
+    def _accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> BaseMaskedArray:
+        """
+        Return the result of a cumulative operation on this array.
+
+        Parameters
+        ----------
+        name : str
+            Name of a function in ``masked_accumulations``, e.g. "cumsum".
+        skipna : bool, default True
+            Forwarded to the accumulation function.
+        **kwargs
+            Forwarded to the accumulation function.
+
+        Returns
+        -------
+        BaseMaskedArray
+            New array of the same type as ``self``.
+        """
+        # The masked accumulation helpers overwrite the masked slots of the
+        # values they receive and, with skipna=True, return the mask they were
+        # given. Pass copies so that ``self`` is left untouched and the result
+        # does not share its mask with ``self``.
+        data = self._data.copy()
+        mask = self._mask.copy()
+
+        op = getattr(masked_accumulations, name)
+        data, mask = op(data, mask, skipna=skipna, **kwargs)
+
+        return type(self)(data, mask, copy=False)
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -410,6 +410,28 @@ def std(
             return self._box_func(result)
         return self._from_backing_data(result)
 
+    # ----------------------------------------------------------------
+    # Accumulations
+
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
+        """
+        Cumulative operations for timedelta data.
+
+        "cumsum" and "cumprod" are computed here on a copy of the backing
+        ndarray; any other ``name`` is forwarded to the parent class.
+        """
+        if name not in {"cumsum", "cumprod"}:
+            return super()._accumulate(name, skipna=skipna, **kwargs)
+
+        # TODO: cumprod should not work here GH#48111
+        accum = {"cumsum": np.cumsum, "cumprod": np.cumprod}[name]
+        values = self._ndarray.copy()
+        accumulated = nanops.na_accum_func(values, accum, skipna=skipna)
+
+        return type(self)._simple_new(
+            cast(np.ndarray, accumulated), freq=None, dtype=self.dtype
+        )
+
     # ----------------------------------------------------------------
     # Rendering Methods
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -10828,7 +10828,11 @@ def _accum_func(
         def block_accum_func(blk_values):
             values = blk_values.T if hasattr(blk_values, "T") else blk_values
 
-            result = nanops.na_accum_func(values, func, skipna=skipna)
+            result: np.ndarray | ExtensionArray
+            if isinstance(values, ExtensionArray):
+                result = values._accumulate(name, skipna=skipna, **kwargs)
+            else:
+                result = nanops.na_accum_func(values, func, skipna=skipna)
 
             result = result.T if hasattr(result, "T") else result
             return result
diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
@@ -41,6 +41,7 @@ class TestMyDtype(BaseDtypeTests):
 ``assert_series_equal`` on your base test class.
 
 """
+from pandas.tests.extension.base.accumulate import BaseAccumulateTests  # noqa
 from pandas.tests.extension.base.casting import BaseCastingTests  # noqa
 from pandas.tests.extension.base.constructors import BaseConstructorsTests  # noqa
 from pandas.tests.extension.base.dim2 import (  # noqa
diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py
@@ -0,0 +1,45 @@
+import pytest
+
+import pandas as pd
+from pandas.tests.extension.base.base import BaseExtensionTests
+
+
+class BaseAccumulateTests(BaseExtensionTests):
+    """
+    Tests for the cumulative methods of a Series backed by an extension array.
+
+    Generally these only make sense for numeric/boolean operations.
+    """
+
+    def check_accumulate(self, s, op_name, skipna):
+        """
+        Compare ``op_name`` on ``s`` with the same operation on float64 data.
+        """
+        result = getattr(s, op_name)(skipna=skipna)
+
+        float32_cumprod = (
+            result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna
+        )
+        if float32_cumprod:
+            pytest.skip(
+                f"Float32 precision lead to large differences with op {op_name} "
+                f"and skipna={skipna}"
+            )
+
+        as_float64 = s.astype("float64")
+        expected = getattr(as_float64, op_name)(skipna=skipna)
+        self.assert_series_equal(result, expected, check_dtype=False)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
+        """Dtypes without accumulation support must raise NotImplementedError."""
+        ser = pd.Series(data)
+        accumulate = getattr(ser, all_numeric_accumulations)
+
+        with pytest.raises(NotImplementedError):
+            accumulate(skipna=skipna)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
+        """Dtypes with accumulation support must match the float64 result."""
+        self.check_accumulate(pd.Series(data), all_numeric_accumulations, skipna)
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -16,6 +16,8 @@
 import numpy as np
 import pytest
 
+from pandas.core.dtypes.common import is_bool_dtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.core.arrays.boolean import BooleanDtype
@@ -393,6 +395,25 @@ class TestUnaryOps(base.BaseUnaryOpsTests):
     pass
 
 
+class TestAccumulation(base.BaseAccumulateTests):
+    def check_accumulate(self, s, op_name, skipna):
+        """
+        Compare with the same accumulation on float64 data.
+
+        cummin and cummax must additionally return a boolean dtype.
+        """
+        result = getattr(s, op_name)(skipna=skipna)
+        reference = pd.Series(s.astype("float64"))
+        expected = getattr(reference, op_name)(skipna=skipna)
+        tm.assert_series_equal(result, expected, check_dtype=False)
+        if op_name == "cummin" or op_name == "cummax":
+            assert is_bool_dtype(result)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
+        """Disabled: boolean arrays support accumulations, so nothing raises."""
+
+
 class TestParsing(base.BaseParsingTests):
     pass
 
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
@@ -156,6 +156,12 @@ class TestReduce(base.BaseNoReduceTests):
     pass
 
 
+class TestAccumulate(base.BaseAccumulateTests):
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
+        """Disabled: only the inherited ``test_accumulate_series_raises`` applies."""
+
+
 class TestMethods(base.BaseMethodsTests):
     @pytest.mark.xfail(reason="Unobserved categories included")
     def test_value_counts(self, all_data, dropna):
diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py