pandas-dev · datajanko · Sep 13, 2019 · Sep 14, 2019 · Sep 15, 2019 · Sep 18, 2019
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -32,6 +32,7 @@ objects.
    .. autosummary::
       :toctree: api/
 
+      api.extensions.ExtensionArray._accumulate
       api.extensions.ExtensionArray._concat_same_type
       api.extensions.ExtensionArray._formatter
       api.extensions.ExtensionArray._from_factorized

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -342,6 +342,7 @@ Other enhancements
 - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`)
 - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for ``yerr`` and/or ``xerr``, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`)
 
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_110.notable_bug_fixes:

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -532,6 +532,7 @@ ExtensionArray
 - Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`284881`)
 - Fixed bug when applying a NumPy ufunc with multiple outputs to a :class:`pandas.arrays.IntegerArray` returning None (:issue:`36913`)
 - Fixed an inconsistency in :class:`PeriodArray`'s ``__init__`` signature to those of :class:`DatetimeArray` and :class:`TimedeltaArray` (:issue:`37289`)
+- Added :meth:`api.extensions.ExtensionArray._accumulate` to the extension array interface. Implemented this interface for :class:`IntegerArray` and :class:`BooleanArray` such that type coercion to ``object`` is avoided (:issue:`28385`)
 
 Other
 ^^^^^

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -860,6 +860,17 @@ def all_logical_operators(request):
     return request.param
 
 
+_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"]
+
+
+@pytest.fixture(params=_all_numeric_accumulations)
+def all_numeric_accumulations(request):
+    """
+    Fixture for numeric accumulation names: cumsum, cumprod, cummin and cummax.
+    """
+    return request.param
+
+
 # ----------------------------------------------------------------
 # Data sets/files
 # ----------------------------------------------------------------

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -70,6 +70,7 @@ class ExtensionArray:
     take
     unique
     view
+    _accumulate
     _concat_same_type
     _formatter
     _from_factorized
@@ -119,8 +120,9 @@ class ExtensionArray:
     as they only compose abstract methods. Still, a more efficient
     implementation may be available, and these methods can be overridden.
 
-    One can implement methods to handle array reductions.
+    One can implement methods to handle array accumulations or reductions.
 
+    * _accumulate
     * _reduce
 
     One can implement methods to handle parsing from strings that will be used
@@ -1154,7 +1156,39 @@ def _concat_same_type(
     # of objects
     _can_hold_na = True
 
+    def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "ExtensionArray":
+        """
+        Return an ExtensionArray performing an accumulation operation.
+
+        The underlying data type might change.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+
+            - cummin
+            - cummax
+            - cumsum
+            - cumprod
+        skipna : bool, default True
+            If True, skip NA values.
+        **kwargs
+            Additional keyword arguments passed to the accumulation function.
+            Currently, there is no supported kwarg.
+
+        Returns
+        -------
+        ExtensionArray
+
+        Raises
+        ------
+        TypeError
+            If the subclass does not define accumulations.
+        """
+        raise TypeError(f"cannot perform {name} with type {self.dtype}")
+
     def _reduce(self, name: str, skipna: bool = True, **kwargs):
         """
         Return a scalar result of performing the reduction operation.
 

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -677,6 +677,13 @@ def _arith_method(self, other, op):
 
         return self._maybe_mask_result(result, mask, other, op_name)
 
+    def _accumulate(self, name: str, skipna: bool = True, **kwargs):
+        from pandas.arrays import IntegerArray
+        # Cast the booleans to int8, reuse the mask, and let IntegerArray accumulate.
+        return IntegerArray(self._data.astype("int8"), self._mask)._accumulate(
+            name, skipna, **kwargs
+        )
+
     def _reduce(self, name: str, skipna: bool = True, **kwargs):
 
         if name in {"any", "all"}:

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -603,6 +603,71 @@ def _arith_method(self, other, op):
 
         return self._maybe_mask_result(result, mask, other, op_name)
 
+    def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray":
+        """
+        Return the cumulative ``name`` of the array as a new IntegerArray.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, one of cumsum, cumprod, cummin or cummax.
+        skipna : bool, default True
+            If True, missing values do not take part in the accumulation and
+            stay missing in the result. If False, every position from the
+            first missing value onwards is missing in the result.
+        **kwargs
+            Accepted for interface compatibility; currently unused.
+
+        Returns
+        -------
+        IntegerArray
+            cumsum and cumprod return a 64-bit result (``UInt64`` for unsigned
+            input, ``Int64`` otherwise); cummin and cummax keep the input dtype.
+
+        Raises
+        ------
+        ValueError
+            If ``name`` is not a supported accumulation.
+        """
+        cum_function = {
+            "cumprod": np.cumprod,
+            "cummax": np.maximum.accumulate,
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+        }.get(name)
+        if cum_function is None:
+            raise ValueError(f"{name} is not defined for IntegerArrays")
+
+        data = self._data
+        mask = self._mask
+
+        if skipna:
+            if mask.any():
+                # Replace missing entries by the neutral element of the operation
+                # so that they cannot influence the valid entries after them.
+                info = np.iinfo(data.dtype)
+                fill_value = {
+                    "cumprod": 1,
+                    "cummax": info.min,
+                    "cumsum": 0,
+                    "cummin": info.max,
+                }[name]
+                data = np.where(mask, data.dtype.type(fill_value), data)
+            # Do not share the mask buffer with the result.
+            mask = mask.copy()
+        else:
+            # Every position from the first missing value onwards is missing.
+            mask = np.maximum.accumulate(mask)
+
+        if name in ("cumsum", "cumprod"):
+            # Widen explicitly: NumPy's default accumulator is the platform
+            # integer, which is not 64 bits wide everywhere.
+            wide = np.uint64 if data.dtype.kind == "u" else np.int64
+            result = cum_function(data, dtype=wide)
+        else:
+            result = cum_function(data)
+        return IntegerArray(result, mask)
+
     def sum(self, skipna=True, min_count=0, **kwargs):
         nv.validate_sum((), kwargs)
         return super()._reduce("sum", skipna=skipna, min_count=min_count)

diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
@@ -41,6 +41,7 @@ class TestMyDtype(BaseDtypeTests):
 ``assert_series_equal`` on your base test class.
 
 """
+from .accumulate import BaseNoAccumulateTests, BaseNumericAccumulateTests  # noqa
 from .casting import BaseCastingTests  # noqa
 from .constructors import BaseConstructorsTests  # noqa
 from .dtype import BaseDtypeTests  # noqa

diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py
@@ -0,0 +1,37 @@
+import pytest
+
+import pandas as pd
+
+from .base import BaseExtensionTests
+
+
+class BaseAccumulateTests(BaseExtensionTests):
+    """
+    Helpers for testing accumulations. These generally only make
+    sense for numeric/boolean data; results are compared to float64.
+    """
+
+    def check_accumulate(self, s, op_name, skipna):
+        actual = getattr(s, op_name)(skipna=skipna)
+        reference = getattr(s.astype("float64"), op_name)(skipna=skipna)
+        self.assert_series_equal(actual, reference, check_dtype=False)
+
+
+class BaseNoAccumulateTests(BaseAccumulateTests):
+    """Tests for arrays that do not define any accumulations."""
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna):
+        ser = pd.Series(data)
+        method_name = all_numeric_accumulations
+
+        with pytest.raises(TypeError):
+            getattr(ser, method_name)(skipna=skipna)
+
+
+class BaseNumericAccumulateTests(BaseAccumulateTests):
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
+        # Wrap the fixture data in a Series and defer to check_accumulate.
+        ser = pd.Series(data)
+        self.check_accumulate(ser, all_numeric_accumulations, skipna)
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -385,5 +385,9 @@ class TestUnaryOps(base.BaseUnaryOpsTests):
     pass
 
 
+class TestNumericAccumulation(base.BaseNumericAccumulateTests):
+    pass
+
+
 class TestParsing(base.BaseParsingTests):
     pass
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
@@ -124,6 +124,10 @@ class TestReduce(base.BaseNoReduceTests):
     pass
 
 
+class TestAccumulate(base.BaseNoAccumulateTests):
+    pass  # the inherited test expects every accumulation to raise TypeError
+
+
 class TestMethods(base.BaseMethodsTests):
     @pytest.mark.skip(reason="Unobserved categories included")
     def test_value_counts(self, all_data, dropna):

diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
@@ -249,6 +249,36 @@ class TestBooleanReduce(base.BaseBooleanReduceTests):
     pass
 
 
+class TestNumericAccumulation(base.BaseNumericAccumulateTests):
+    def check_accumulate(self, s, op_name, skipna):
+        """
+        Compare an accumulation on a masked integer Series with the float64 result.
+
+        Overridden to ensure pd.NA is tested instead of np.nan, see
+        https://github.com/pandas-dev/pandas/issues/30958
+        """
+        if op_name in ["cummax", "cummin"]:
+            # min/max accumulations are expected to keep the input dtype
+            expected_dtype = s.dtype
+        elif op_name in ["cumsum", "cumprod"]:
+            # sums/products are expected in the 64-bit dtype of matching signedness
+            expected_dtype = "uint64" if s.dtype.name.startswith("U") else "int64"
+            if op_name == "cumprod":
+                # only the first 20 values are used, presumably to avoid overflow
+                s = s[:20]
+        else:
+            raise NotImplementedError(f"{op_name} is not a supported accumulation")
+
+        result = getattr(s, op_name)(skipna=skipna)
+        expected = pd.Series(
+            integer_array(
+                getattr(s.astype("float64"), op_name)(skipna=skipna),
+                dtype=expected_dtype,
+            )
+        )
+        tm.assert_series_equal(result, expected)
+
+
 class TestPrinting(base.BasePrintingTests):
     pass