diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index ce8d8d5c2ca10..595b415ff7342 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -32,6 +32,7 @@ objects. .. autosummary:: :toctree: api/ + api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e16ef0857685d..fdaf46e4ca480 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -76,6 +76,7 @@ Other enhancements - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added support for extension array dtypes in :func:`merge` (:issue:`44240`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) +- Added ``cumsum``, ``cumprod``, ``cummin`` and ``cummax`` to the ``ExtensionArray`` interface via ``_accumulate`` (:issue:`28385`) - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`) - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`) - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`) diff --git a/pandas/conftest.py b/pandas/conftest.py index 05877ddf6e223..0d6af91d32dea 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1123,6 +1123,17 @@ def all_logical_operators(request): return request.param +_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + + +@pytest.fixture(params=_all_numeric_accumulations) +def all_numeric_accumulations(request): + """ + Fixture for numeric accumulation names + """ + return request.param + + # ---------------------------------------------------------------- # Data sets/files # ---------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py new file mode 100644 index 0000000000000..07113128e0947 --- /dev/null +++ b/pandas/core/array_algos/masked_accumulations.py @@ -0,0 +1,92 @@ +""" +masked_accumulations.py is for accumulation algorithms using a mask-based approach +for missing values. +""" + +from __future__ import annotations + +from typing import Callable + +import numpy as np + +from pandas._typing import npt + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_float_dtype, + is_integer_dtype, +) + + +def _cum_func( + func: Callable, + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, +): + """ + Accumulations for 1D masked array. + + We will modify values in place to replace NAs with the appropriate fill value. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + dtype_info: np.iinfo | np.finfo + if is_float_dtype(values): + dtype_info = np.finfo(values.dtype.type) + elif is_integer_dtype(values): + dtype_info = np.iinfo(values.dtype.type) + elif is_bool_dtype(values): + # Max value of bool is 1, but since we are setting into a boolean + # array, 255 is fine as well. Min value has to be 0 when setting + # into the boolean array. + dtype_info = np.iinfo(np.uint8) + else: + raise NotImplementedError( + f"No masked accumulation defined for dtype {values.dtype.type}" + ) + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: dtype_info.min, + np.cumsum: 0, + np.minimum.accumulate: dtype_info.max, + }[func] + except KeyError: + raise NotImplementedError( + f"No accumulation for {func} implemented on BaseMaskedArray" + ) + + values[mask] = fill_value + + if not skipna: + mask = np.maximum.accumulate(mask) + + values = func(values) + return values, mask + + +def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.cumsum, values, mask, skipna=skipna) + + +def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.cumprod, values, mask, skipna=skipna) + + +def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna) + + +def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f11d031b2f622..c36728391ba21 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -133,6 +133,7 @@ class ExtensionArray: tolist unique view + _accumulate _concat_same_type _formatter _from_factorized @@ -182,8 +183,9 @@ class ExtensionArray: as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. - One can implement methods to handle array reductions. + One can implement methods to handle array accumulations or reductions. + * _accumulate * _reduce One can implement methods to handle parsing from strings that will be used @@ -1368,6 +1370,38 @@ def _concat_same_type( def _can_hold_na(self) -> bool: return self.dtype._can_hold_na + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ExtensionArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ Return a scalar result of performing the reduction operation. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 35b9de3f7af93..8ac665b1b2e11 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops +from pandas.core.array_algos import masked_accumulations from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, @@ -378,3 +379,19 @@ def _logical_method(self, other, op): # i.e. BooleanArray return self._maybe_mask_result(result, mask) + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArray: + data = self._data + mask = self._mask + if name in ("cummin", "cummax"): + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + return type(self)(data, mask, copy=False) + else: + from pandas.core.arrays import IntegerArray + + return IntegerArray(data.astype(int), mask)._accumulate( + name, skipna=skipna, **kwargs + ) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f9ff702a608a4..69b7d5a220d24 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1352,6 +1352,27 @@ def _addsub_object_array(self, other: np.ndarray, op): result = result.reshape(self.shape) return result + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): + + if is_period_dtype(self.dtype): + data = self + else: + # Incompatible types in assignment (expression has type + # "ndarray[Any, Any]", variable has type "DatetimeLikeArrayMixin" + data = self._ndarray.copy() # type: ignore[assignment] + + if name in {"cummin", "cummax"}: + func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate + result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) + + # error: Unexpected keyword argument "freq" for + # "_simple_new" of "NDArrayBacked" [call-arg] + return type(self)._simple_new( + result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] + ) + + raise TypeError(f"Accumulation {name} not supported for {type(self)}") + @unpack_zerodim_and_defer("__add__") def __add__(self, other): other_dtype = getattr(other, "dtype", None) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3aa6a12160b73..3071016bb3bda 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -73,7 +73,10 @@ isin, take, ) -from pandas.core.array_algos import masked_reductions +from pandas.core.array_algos import ( + masked_accumulations, + masked_reductions, +) from pandas.core.array_algos.quantile import quantile_with_mask from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray @@ -1328,3 +1331,14 @@ def all(self, *, skipna: bool = True, **kwargs): return result else: return self.dtype.na_value + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArray: + data = self._data + mask = self._mask + + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + + return type(self)(data, mask, copy=False) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 334e5437c2f70..e68b56c57c176 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -410,6 +410,23 @@ def std( return self._box_func(result) return self._from_backing_data(result) + # ---------------------------------------------------------------- + # Accumulations + + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): + + data = self._ndarray.copy() + + if name in {"cumsum", "cumprod"}: + # TODO: cumprod should not work here GH#48111 + func = np.cumsum if name == "cumsum" else np.cumprod + result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) + + return type(self)._simple_new(result, freq=None, dtype=self.dtype) + + else: + return super()._accumulate(name, skipna=skipna, **kwargs) + # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c5a931fe29ab1..46330ca22ccdb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10828,7 +10828,11 @@ def _accum_func( def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - result = nanops.na_accum_func(values, func, skipna=skipna) + result: np.ndarray | ExtensionArray + if isinstance(values, ExtensionArray): + result = values._accumulate(name, skipna=skipna, **kwargs) + else: + result = nanops.na_accum_func(values, func, skipna=skipna) result = result.T if hasattr(result, "T") else result return result diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 571ab3dca1efc..7e765cc5342d1 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -41,6 +41,7 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. """ +from pandas.tests.extension.base.accumulate import BaseAccumulateTests # noqa from pandas.tests.extension.base.casting import BaseCastingTests # noqa from pandas.tests.extension.base.constructors import BaseConstructorsTests # noqa from pandas.tests.extension.base.dim2 import ( # noqa diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py new file mode 100644 index 0000000000000..868172f930844 --- /dev/null +++ b/pandas/tests/extension/base/accumulate.py @@ -0,0 +1,37 @@ +import pytest + +import pandas as pd +from pandas.tests.extension.base.base import BaseExtensionTests + + +class BaseAccumulateTests(BaseExtensionTests): + """ + Accumulation specific tests. Generally these only + make sense for numeric/boolean operations. + """ + + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + + if result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna: + pytest.skip( + f"Float32 precision lead to large differences with op {op_name} " + f"and skipna={skipna}" + ) + + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + self.assert_series_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + ser = pd.Series(data) + + with pytest.raises(NotImplementedError): + getattr(ser, op_name)(skipna=skipna) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + ser = pd.Series(data) + self.check_accumulate(ser, op_name, skipna) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 9646ade43e1d7..b611701e4e429 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -16,6 +16,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_bool_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype @@ -393,6 +395,19 @@ class TestUnaryOps(base.BaseUnaryOpsTests): pass +class TestAccumulation(base.BaseAccumulateTests): + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) + tm.assert_series_equal(result, expected, check_dtype=False) + if op_name in ("cummin", "cummax"): + assert is_bool_dtype(result) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): + pass + + class TestParsing(base.BaseParsingTests): pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 1e17bf33c806c..9a363c6a0f022 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -156,6 +156,12 @@ class TestReduce(base.BaseNoReduceTests): pass +class TestAccumulate(base.BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + pass + + class TestMethods(base.BaseMethodsTests): @pytest.mark.xfail(reason="Unobserved categories included") def test_value_counts(self, all_data, dropna): diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 580ab743a9d93..f67f7dc56d26f 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -217,3 +217,9 @@ class TestParsing(base.BaseParsingTests): @pytest.mark.filterwarnings("ignore:overflow encountered in reduce:RuntimeWarning") class Test2DCompat(base.Dim2CompatTests): pass + + +class TestAccumulation(base.BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): + pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index ba6daf4f2e189..788a0bf46afc5 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -16,6 +16,11 @@ import numpy as np import pytest +from pandas.compat import ( + IS64, + is_platform_windows, +) + import pandas as pd import pandas._testing as tm from pandas.api.types import ( @@ -227,6 +232,56 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): pass +class TestAccumulation(base.BaseAccumulateTests): + def check_accumulate(self, s, op_name, skipna): + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + length = 64 + if not IS64 or is_platform_windows(): + if not s.dtype.itemsize == 8: + length = 32 + + if s.dtype.name.startswith("U"): + expected_dtype = f"UInt{length}" + else: + expected_dtype = f"Int{length}" + + if op_name == "cumsum": + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + pd.array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name in ["cummax", "cummin"]: + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + pd.array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=s.dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name == "cumprod": + result = getattr(s[:12], op_name)(skipna=skipna) + expected = pd.Series( + pd.array( + getattr(s[:12].astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + + else: + raise NotImplementedError(f"{op_name} not supported") + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): + pass + + class TestPrinting(base.BasePrintingTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f82d3c6c06fca..022e5cb764e14 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -482,3 +482,9 @@ def test_EA_types(self, engine, data): with pytest.raises(NotImplementedError, match=expected_msg): with tm.assert_produces_warning(FutureWarning, match="astype from"): super().test_EA_types(engine, data) + + +class TestNoNumericAccumulations(base.BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + pass