diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 7b451ed3bf296..bc21fcf61b2db 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -32,6 +32,7 @@ objects. .. autosummary:: :toctree: api/ + api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9f3ccb3e14116..590c23d955453 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -342,6 +342,7 @@ Other enhancements - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for ``yerr`` and/or ``xerr``, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) + .. --------------------------------------------------------------------------- .. 
_whatsnew_110.notable_bug_fixes: diff --git a/pandas/conftest.py b/pandas/conftest.py index ce572e42abec6..c5df29829634e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1005,6 +1005,17 @@ def all_logical_operators(request): return request.param +_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + + +@pytest.fixture(params=_all_numeric_accumulations) +def all_numeric_accumulations(request): + """ + Fixture for numeric accumulation names + """ + return request.param + + # ---------------------------------------------------------------- # Data sets/files # ---------------------------------------------------------------- diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py new file mode 100644 index 0000000000000..f2e0bb0ba3ff6 --- /dev/null +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -0,0 +1,69 @@ +from typing import Callable + +import numpy as np + +from pandas._libs import iNaT + +from pandas.core.dtypes.missing import isna + +""" +datetimelike_accumulations.py is for accumulations of datetimelike extension arrays +""" + + +def _cum_func( + func: Callable, + values: np.ndarray, + *, + skipna: bool = True, +): + """ + Accumulations for 1D datetimelike arrays. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + skipna : bool, default True + Whether to skip NA. 
+ """ + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: np.iinfo(np.int64).min, + np.cumsum: 0, + np.minimum.accumulate: np.iinfo(np.int64).max, + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on datetimelikes") + + mask = isna(values) + y = values.view("i8") + y[mask] = fill_value + + if not skipna: + # This is different compared to the recent implementation for datetimelikes + # but is the same as the implementation for masked arrays + mask = np.maximum.accumulate(mask) + + result = func(y) + result[mask] = iNaT + return result + + +def cumsum(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, skipna=skipna) + + +def cumprod(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumprod, values, skipna=skipna) + + +def cummin(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, skipna=skipna) + + +def cummax(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, skipna=skipna) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py new file mode 100644 index 0000000000000..fee283227df57 --- /dev/null +++ b/pandas/core/array_algos/masked_accumulations.py @@ -0,0 +1,78 @@ +from typing import Callable + +import numpy as np + +from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, +) + +""" +masked_accumulations.py is for accumulation algorithms using a mask-based approach +for missing values. +""" + + +def _cum_func( + func: Callable, + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, +): + """ + Accumulations for 1D masked array. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). 
+ mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + dtype_info = None + if is_float_dtype(values): + dtype_info = np.finfo(values.dtype.type) + elif is_integer_dtype(values): + dtype_info = np.iinfo(values.dtype.type) + else: + raise NotImplementedError( + f"No masked accumulation defined for dtype {values.dtype.type}" + ) + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: dtype_info.min, + np.cumsum: 0, + np.minimum.accumulate: dtype_info.max, + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + + values[mask] = fill_value + + if not skipna: + mask = np.maximum.accumulate(mask) + + values = func(values) + return values, mask + + +def cumsum(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, mask, skipna=skipna) + + +def cumprod(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumprod, values, mask, skipna=skipna) + + +def cummin(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna) + + +def cummax(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index edc8fa14ca142..09d6a030040f5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -108,6 +108,7 @@ class ExtensionArray: take unique view + _accumulate _concat_same_type _formatter _from_factorized @@ -157,8 +158,9 @@ class ExtensionArray: as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. - One can implement methods to handle array reductions. + One can implement methods to handle array accumulations or reductions. 
+ * _accumulate * _reduce One can implement methods to handle parsing from strings that will be used @@ -1253,6 +1255,37 @@ def _concat_same_type( # of objects _can_hold_na = True + def _accumulate( + self: ExtensionArray, name: str, *, skipna=True, **kwargs + ) -> ExtensionArray: + """ + Return an ExtensionArray performing an accumulation operation. + The underlying data type might change + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ Return a scalar result of performing the reduction operation. 
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 260cd08707473..f8a628d347229 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -712,6 +712,15 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return super()._reduce(name, skipna=skipna, **kwargs) + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArray: + from pandas.core.arrays import IntegerArray + + data = self._data.astype(int) + mask = self._mask + return IntegerArray(data, mask)._accumulate(name, skipna=skipna, **kwargs) + def _maybe_mask_result(self, result, mask, other, op_name: str): """ Parameters diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 5dd55ff0f1fa2..230bdf8cdd1ea 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -93,6 +93,7 @@ isin, unique1d, ) +from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import ( NDArrayBackedExtensionArray, @@ -1204,6 +1205,22 @@ def _time_shift(self, periods, freq=None): # to be passed explicitly. 
return self._generate_range(start=start, end=end, periods=None, freq=self.freq) + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> DatetimeLikeArrayT: + + data = self._data.copy() + + if name in {"cummin", "cummax"}: + op = getattr(datetimelike_accumulations, name) + data = op(data, skipna=skipna, **kwargs) + + return type(self)._simple_new(data, freq=self.freq, dtype=self.dtype) + + raise NotImplementedError( + f"Accumulation {name} not implemented for {type(self)}" + ) + @unpack_zerodim_and_defer("__add__") def __add__(self, other): other_dtype = getattr(other, "dtype", None) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bae14f4e560c2..59fd406407655 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -49,7 +49,10 @@ isin, take, ) -from pandas.core.array_algos import masked_reductions +from pandas.core.array_algos import ( + masked_accumulations, + masked_reductions, +) from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.indexers import check_array_indexer @@ -457,3 +460,19 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return libmissing.NA return result + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArray: + data = self._data + mask = self._mask + + if name in {"cumsum", "cumprod", "cummin", "cummax"}: + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + + return type(self)(data, mask, copy=False) + + raise NotImplementedError( + f"Accumulation {name} not implemented for BaseMaskedArray" + ) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 893644be23a0e..457e4de1ce015 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -57,6 +57,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr +from 
pandas.core.array_algos import datetimelike_accumulations from pandas.core.arrays import ( IntegerArray, datetimelike as dtl, @@ -403,6 +404,24 @@ def std( return self._box_func(result) return self._from_backing_data(result) + # ---------------------------------------------------------------- + # Accumulations + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> TimedeltaArray: + + data = self._data.copy() + + if name == "cumsum": + op = getattr(datetimelike_accumulations, name) + data = op(data, skipna=skipna, **kwargs) + + return type(self)._simple_new(data, freq=None, dtype=self.dtype) + + else: + return super()._accumulate(name, skipna=skipna, **kwargs) + # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a32ae7090ef8b..5c97263401259 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10320,7 +10320,14 @@ def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - result = nanops.na_accum_func(values, func, skipna=skipna) + from pandas.core.construction import ensure_wrapped_if_datetimelike + + values = ensure_wrapped_if_datetimelike(values) + + if isinstance(values, ExtensionArray): + result = values._accumulate(name, skipna=skipna, **kwargs) + else: + result = nanops.na_accum_func(values, func, skipna=skipna) result = result.T if hasattr(result, "T") else result return result diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 9cf3bdab40d0b..00d641a8e1895 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -41,6 +41,10 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. 
""" +from pandas.tests.extension.base.accumulate import ( # noqa + BaseNoAccumulateTests, + BaseNumericAccumulateTests, +) from pandas.tests.extension.base.casting import BaseCastingTests # noqa from pandas.tests.extension.base.constructors import BaseConstructorsTests # noqa from pandas.tests.extension.base.dtype import BaseDtypeTests # noqa diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py new file mode 100644 index 0000000000000..882f96572791c --- /dev/null +++ b/pandas/tests/extension/base/accumulate.py @@ -0,0 +1,36 @@ +import pytest + +import pandas as pd +from pandas.tests.extension.base.base import BaseExtensionTests + + +class BaseAccumulateTests(BaseExtensionTests): + """ + Accumulation specific tests. Generally these only + make sense for numeric/boolean operations. + """ + + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + self.assert_series_equal(result, expected, check_dtype=False) + + +class BaseNoAccumulateTests(BaseAccumulateTests): + """ we don't define any accumulations """ + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + s = pd.Series(data) + + with pytest.raises(NotImplementedError): + getattr(s, op_name)(skipna=skipna) + + +class BaseNumericAccumulateTests(BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + s = pd.Series(data) + self.check_accumulate(s, op_name, skipna) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 3ef3beaa9c1b1..862aa5cd7dde5 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -393,5 +393,15 @@ class 
TestUnaryOps(base.BaseUnaryOpsTests): pass +class TestNumericAccumulation(base.BaseNumericAccumulateTests): + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) + tm.assert_series_equal(result, expected, check_dtype=False) + + +# TODO parsing not yet supported +# class TestParsing(base.BaseParsingTests): +# pass class TestParsing(base.BaseParsingTests): pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 3f1f2c02c79f7..404f8132e5f1d 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -151,6 +151,10 @@ class TestReduce(base.BaseNoReduceTests): pass +class TestAccumulate(base.BaseNoAccumulateTests): + pass + + class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unobserved categories included") def test_value_counts(self, all_data, dropna): diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 617dfc694741e..213e37e015975 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -221,3 +221,7 @@ class TestPrinting(base.BasePrintingTests): class TestParsing(base.BaseParsingTests): pass + + +class TestNumericAccumulation(base.BaseNumericAccumulateTests): + pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 2305edc1e1327..50b57be6dd275 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -247,6 +247,54 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): pass +class TestNumericAccumulation(base.BaseNumericAccumulateTests): + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) + tm.assert_series_equal(result, 
expected, check_dtype=False) + # # overwrite to ensure pd.NA is tested instead of np.nan + # # https://github.com/pandas-dev/pandas/issues/30958 + # if op_name == "cumsum": + # if s.dtype.name.startswith("U"): + # expected_dtype = "uint64" + # else: + # expected_dtype = "int64" + # result = getattr(s, op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s.astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + # elif op_name in ["cummax", "cummin"]: + # expected_dtype = s.dtype + # result = getattr(s, op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s.astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + # elif op_name == "cumprod": + # if s.dtype.name.startswith("U"): + # expected_dtype = "uint64" + # else: + # expected_dtype = "int64" + # result = getattr(s[:20], op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s[:20].astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + + # else: + # raise + + class TestPrinting(base.BasePrintingTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 067fada5edcae..45b34ed0d300d 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -464,3 +464,7 @@ def test_EA_types(self, engine, data): expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) + + +class TestNoNumericAccumulations(base.BaseNoAccumulateTests): + pass