Skip to content

Commit b5953aa

Browse files
phofl, datajanko, mroeschke
authored
ENH: Add cumulative methods to ea (pandas-dev#48111)
* define accumulation interface for ExtensionArrays * reformulate doc string * creates baseExtension tests for accumulate * adds fixtures for numeric_accumulations * fixes typos * adds accumulate tests for integer arrays * fixes typo * first implementation of cumsum * stashed merge conflict * fixes formatting * first green test for integer extension arrays and cumsum * first passing tests for cummin and cummax * utilizes na_accum_func * removes delegation leftover * creates running tests * removes ABCExtensionArray Type hint * removes clutter from generic.py * removes clutter in _accumulate * adds typehints for ExtensionArray and IntegerArray * delegates the accumulate calls to extension arrays * removes diff in nanops * removes unwanted pattern * makes output types for sum and prod explicit * makes the base accumulate test more general by not comparing types * implements accumulation for boolean arrays * uses f-string in base.py * uses blockmanager also for extension arrays * fixes flake8 issues * removes uncommented code * adds todo for runtime warning * reuses integer array to accumulate for booleans * removes runtimewarning catching * removes TODOs * adds accumulate to autosummary * excludes datetime from propagating to _accumulate * uses pandas.testing instead of pandas.util.testing in accumulate * replaces assert_almost_equal with assert_series_equal * dtypes to lowercase * lowercase of uint and int64 dtype in _accumulate * uses hint of @simonjayhawkins concerning assert series equals * adds whatsnew entry * moves changes to 1.2.0 * uses na_accum_func * delegate to EAs _accumulate function in block mgr * moves implementation from nanops to masked_accumulations * fixes typing annotations in base and masked * fixes merge error * fills na values without nanops * fixes incorrect call to cumsum and changes to cumprod * add _accumulate to boolean * makes tests a lot easier - cumprod tests still fail * adds BaseNumericAccumulation for floating masked array * tests 
no numeric accumulations according to _accumulate interface * uses NotImplementedError in base accumulate function * ensures the fill values are data independent additionally, remove min_count as irrelevant * adds accumulation for datetimelikes in generic.py ensure that datetimelikes are wrapped create a twin of masked_accumulations for datetimelikes timedeltas also allow cumsum and cumprod, theoretically * actually adds datetimelike accumulation algos * fixes absolute imports * changes error to catch to adhere to changed implementation * Remove blank line in old whatsnew * Remove merge error * Fix additional merge errors * Refactor datetimelike accum funcs * Remove unnecessary import * Refactor tests * Skip test * Fix mypy * Fix dtype creation * Fix cumprod tests * Fix docstring * Address review * Address review * Update pandas/core/arrays/base.py Co-authored-by: Matthew Roeschke <[email protected]> * Update pandas/tests/extension/test_integer.py Co-authored-by: Matthew Roeschke <[email protected]> * Add comment * Clarify comment * Fix pre commit * Add whatsnew * Move to top of file * Change error * Change _data * Remove * Add todo * Fix typo * Adjust var * Special case * Fix tests * Combine classes * Fix mypy Co-authored-by: Jan Koch <[email protected]> Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 5a372d8 commit b5953aa

17 files changed

+341
-3
lines changed

doc/source/reference/extensions.rst

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ objects.
3232
.. autosummary::
3333
:toctree: api/
3434
35+
api.extensions.ExtensionArray._accumulate
3536
api.extensions.ExtensionArray._concat_same_type
3637
api.extensions.ExtensionArray._formatter
3738
api.extensions.ExtensionArray._from_factorized

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Other enhancements
7676
- Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
7777
- Added support for extension array dtypes in :func:`merge` (:issue:`44240`)
7878
- Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
79+
- Added ``cumsum``, ``cumprod``, ``cummin`` and ``cummax`` to the ``ExtensionArray`` interface via ``_accumulate`` (:issue:`28385`)
7980
- :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
8081
- Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)
8182
- :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)

pandas/conftest.py

+11
Original file line numberDiff line numberDiff line change
@@ -1123,6 +1123,17 @@ def all_logical_operators(request):
11231123
return request.param
11241124

11251125

1126+
# Method names of the cumulative operations exercised by the accumulation tests.
_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"]


@pytest.fixture(params=_all_numeric_accumulations)
def all_numeric_accumulations(request):
    """
    Parametrized fixture returning, one at a time, each name listed in
    ``_all_numeric_accumulations``.
    """
    accumulation_name = request.param
    return accumulation_name
1135+
1136+
11261137
# ----------------------------------------------------------------
11271138
# Data sets/files
11281139
# ----------------------------------------------------------------
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
masked_accumulations.py is for accumulation algorithms using a mask-based approach
3+
for missing values.
4+
"""
5+
6+
from __future__ import annotations
7+
8+
from typing import Callable
9+
10+
import numpy as np
11+
12+
from pandas._typing import npt
13+
14+
from pandas.core.dtypes.common import (
15+
is_bool_dtype,
16+
is_float_dtype,
17+
is_integer_dtype,
18+
)
19+
20+
21+
def _cum_func(
    func: Callable,
    values: np.ndarray,
    mask: npt.NDArray[np.bool_],
    *,
    skipna: bool = True,
) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
    """
    Accumulations for 1D masked array.

    We will modify values in place to replace NAs with the appropriate fill value.

    Parameters
    ----------
    func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
    values : np.ndarray
        Numpy array with the values (can be of any dtype that support the
        operation).
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The result of applying ``func`` to the filled values, and the mask
        that goes with it. With ``skipna=True`` the input mask is returned
        as is; with ``skipna=False`` a new mask is returned in which every
        position from the first missing value onwards is True.

    Raises
    ------
    NotImplementedError
        If ``values`` is not of a float, integer or boolean dtype, or if
        ``func`` is not one of the four supported accumulation functions.
    """
    dtype_info: np.iinfo | np.finfo
    if is_float_dtype(values):
        dtype_info = np.finfo(values.dtype.type)
    elif is_integer_dtype(values):
        dtype_info = np.iinfo(values.dtype.type)
    elif is_bool_dtype(values):
        # Max value of bool is 1, but since we are setting into a boolean
        # array, 255 is fine as well. Min value has to be 0 when setting
        # into the boolean array.
        dtype_info = np.iinfo(np.uint8)
    else:
        raise NotImplementedError(
            f"No masked accumulation defined for dtype {values.dtype.type}"
        )
    try:
        # The fill value is the neutral element of each operation, so the
        # filled-in missing positions do not influence the running result.
        fill_value = {
            np.cumprod: 1,
            np.maximum.accumulate: dtype_info.min,
            np.cumsum: 0,
            np.minimum.accumulate: dtype_info.max,
        }[func]
    except KeyError as err:
        # Chain explicitly so the traceback shows the failed lookup as the
        # direct cause rather than as an error raised while handling another.
        raise NotImplementedError(
            f"No accumulation for {func} implemented on BaseMaskedArray"
        ) from err

    values[mask] = fill_value

    if not skipna:
        # Once a missing value was seen, everything after it is missing too.
        mask = np.maximum.accumulate(mask)

    values = func(values)
    return values, mask
77+
78+
79+
def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    """
    Cumulative sum of a masked array: delegates to ``_cum_func`` with
    ``np.cumsum`` and returns its ``(values, mask)`` pair.
    """
    accumulated, new_mask = _cum_func(np.cumsum, values, mask, skipna=skipna)
    return accumulated, new_mask
81+
82+
83+
def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    """
    Cumulative product of a masked array: delegates to ``_cum_func`` with
    ``np.cumprod`` and returns its ``(values, mask)`` pair.
    """
    accumulated, new_mask = _cum_func(np.cumprod, values, mask, skipna=skipna)
    return accumulated, new_mask
85+
86+
87+
def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    """
    Cumulative minimum of a masked array: delegates to ``_cum_func`` with
    ``np.minimum.accumulate`` and returns its ``(values, mask)`` pair.
    """
    accumulated, new_mask = _cum_func(
        np.minimum.accumulate, values, mask, skipna=skipna
    )
    return accumulated, new_mask
89+
90+
91+
def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
    """
    Cumulative maximum of a masked array: delegates to ``_cum_func`` with
    ``np.maximum.accumulate`` and returns its ``(values, mask)`` pair.
    """
    accumulated, new_mask = _cum_func(
        np.maximum.accumulate, values, mask, skipna=skipna
    )
    return accumulated, new_mask

pandas/core/arrays/base.py

+35-1
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ class ExtensionArray:
133133
tolist
134134
unique
135135
view
136+
_accumulate
136137
_concat_same_type
137138
_formatter
138139
_from_factorized
@@ -182,8 +183,9 @@ class ExtensionArray:
182183
as they only compose abstract methods. Still, a more efficient
183184
implementation may be available, and these methods can be overridden.
184185
185-
One can implement methods to handle array reductions.
186+
One can implement methods to handle array accumulations or reductions.
186187
188+
* _accumulate
187189
* _reduce
188190
189191
One can implement methods to handle parsing from strings that will be used
@@ -1368,6 +1370,38 @@ def _concat_same_type(
13681370
def _can_hold_na(self) -> bool:
13691371
return self.dtype._can_hold_na
13701372

1373+
def _accumulate(
    self, name: str, *, skipna: bool = True, **kwargs
) -> ExtensionArray:
    """
    Perform a cumulative operation and return the result as an ExtensionArray.

    The dtype of the result may differ from the dtype of ``self``.

    Parameters
    ----------
    name : str
        Name of the accumulation to perform. Supported values are:
        - cummin
        - cummax
        - cumsum
        - cumprod
    skipna : bool, default True
        If True, skip NA values.
    **kwargs
        Additional keyword arguments passed to the accumulation function.
        Currently, there is no supported kwarg.

    Returns
    -------
    array

    Raises
    ------
    NotImplementedError : subclass does not define accumulations
    """
    # The base class offers no accumulation; subclasses override this method.
    message = f"cannot perform {name} with type {self.dtype}"
    raise NotImplementedError(message)
1404+
13711405
def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
13721406
"""
13731407
Return a scalar result of performing the reduction operation.

pandas/core/arrays/boolean.py

+17
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from pandas.core.dtypes.missing import isna
2727

2828
from pandas.core import ops
29+
from pandas.core.array_algos import masked_accumulations
2930
from pandas.core.arrays.masked import (
3031
BaseMaskedArray,
3132
BaseMaskedDtype,
@@ -378,3 +379,19 @@ def _logical_method(self, other, op):
378379

379380
# i.e. BooleanArray
380381
return self._maybe_mask_result(result, mask)
382+
383+
def _accumulate(
    self, name: str, *, skipna: bool = True, **kwargs
) -> BaseMaskedArray:
    """
    Cumulative operation on a boolean masked array.

    ``cummin`` and ``cummax`` are computed directly on the boolean data via
    ``masked_accumulations`` and returned as an array of the same type as
    ``self``. Any other ``name`` is delegated to an ``IntegerArray`` built
    from the data cast to ``int`` and the same mask.
    """
    if name not in ("cummin", "cummax"):
        from pandas.core.arrays import IntegerArray

        as_integers = IntegerArray(self._data.astype(int), self._mask)
        return as_integers._accumulate(name, skipna=skipna, **kwargs)

    accumulate = getattr(masked_accumulations, name)
    values, na_mask = accumulate(self._data, self._mask, skipna=skipna, **kwargs)
    return type(self)(values, na_mask, copy=False)

pandas/core/arrays/datetimelike.py

+21
Original file line numberDiff line numberDiff line change
@@ -1352,6 +1352,27 @@ def _addsub_object_array(self, other: np.ndarray, op):
13521352
result = result.reshape(self.shape)
13531353
return result
13541354

1355+
def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
    """
    Cumulative minimum or maximum of a datetime-like array.

    Parameters
    ----------
    name : str
        Name of the accumulation; only "cummin" and "cummax" are handled here.
    skipna : bool, default True
        Forwarded to ``nanops.na_accum_func``.
    **kwargs
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    Array of the same type and dtype as ``self``, built with ``_simple_new``.

    Raises
    ------
    TypeError
        If ``name`` is anything other than "cummin" or "cummax".
    """
    # Reject unsupported accumulations up front so the backing array is not
    # copied only to be thrown away when we raise.
    if name not in {"cummin", "cummax"}:
        raise TypeError(f"Accumulation {name} not supported for {type(self)}")

    if is_period_dtype(self.dtype):
        data = self
    else:
        # Incompatible types in assignment (expression has type
        # "ndarray[Any, Any]", variable has type "DatetimeLikeArrayMixin"
        data = self._ndarray.copy()  # type: ignore[assignment]

    func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate
    result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna))

    # NOTE(review): self.freq is forwarded unchanged; a running min/max need
    # not keep the regular spacing a freq describes -- confirm this is intended.
    # error: Unexpected keyword argument "freq" for
    # "_simple_new" of "NDArrayBacked"  [call-arg]
    return type(self)._simple_new(
        result, freq=self.freq, dtype=self.dtype  # type: ignore[call-arg]
    )
1375+
13551376
@unpack_zerodim_and_defer("__add__")
13561377
def __add__(self, other):
13571378
other_dtype = getattr(other, "dtype", None)

pandas/core/arrays/masked.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,10 @@
7373
isin,
7474
take,
7575
)
76-
from pandas.core.array_algos import masked_reductions
76+
from pandas.core.array_algos import (
77+
masked_accumulations,
78+
masked_reductions,
79+
)
7780
from pandas.core.array_algos.quantile import quantile_with_mask
7881
from pandas.core.arraylike import OpsMixin
7982
from pandas.core.arrays import ExtensionArray
@@ -1328,3 +1331,14 @@ def all(self, *, skipna: bool = True, **kwargs):
13281331
return result
13291332
else:
13301333
return self.dtype.na_value
1334+
1335+
def _accumulate(
    self, name: str, *, skipna: bool = True, **kwargs
) -> BaseMaskedArray:
    """
    Cumulative operation on a masked array.

    Looks up the function called ``name`` in ``masked_accumulations``, applies
    it to this array's data and mask (passed without copying) and wraps the
    resulting values and mask in a new array of the same type as ``self``.
    """
    accumulate = getattr(masked_accumulations, name)
    values, na_mask = accumulate(self._data, self._mask, skipna=skipna, **kwargs)
    return type(self)(values, na_mask, copy=False)

pandas/core/arrays/timedeltas.py

+17
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,23 @@ def std(
410410
return self._box_func(result)
411411
return self._from_backing_data(result)
412412

413+
# ----------------------------------------------------------------
414+
# Accumulations
415+
416+
def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
417+
418+
data = self._ndarray.copy()
419+
420+
if name in {"cumsum", "cumprod"}:
421+
# TODO: cumprod should not work here GH#48111
422+
func = np.cumsum if name == "cumsum" else np.cumprod
423+
result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna))
424+
425+
return type(self)._simple_new(result, freq=None, dtype=self.dtype)
426+
427+
else:
428+
return super()._accumulate(name, skipna=skipna, **kwargs)
429+
413430
# ----------------------------------------------------------------
414431
# Rendering Methods
415432

pandas/core/generic.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -10828,7 +10828,11 @@ def _accum_func(
1082810828
def block_accum_func(blk_values):
1082910829
values = blk_values.T if hasattr(blk_values, "T") else blk_values
1083010830

10831-
result = nanops.na_accum_func(values, func, skipna=skipna)
10831+
result: np.ndarray | ExtensionArray
10832+
if isinstance(values, ExtensionArray):
10833+
result = values._accumulate(name, skipna=skipna, **kwargs)
10834+
else:
10835+
result = nanops.na_accum_func(values, func, skipna=skipna)
1083210836

1083310837
result = result.T if hasattr(result, "T") else result
1083410838
return result

pandas/tests/extension/base/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class TestMyDtype(BaseDtypeTests):
4141
``assert_series_equal`` on your base test class.
4242
4343
"""
44+
from pandas.tests.extension.base.accumulate import BaseAccumulateTests # noqa
4445
from pandas.tests.extension.base.casting import BaseCastingTests # noqa
4546
from pandas.tests.extension.base.constructors import BaseConstructorsTests # noqa
4647
from pandas.tests.extension.base.dim2 import ( # noqa
+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import pytest
2+
3+
import pandas as pd
4+
from pandas.tests.extension.base.base import BaseExtensionTests
5+
6+
7+
class BaseAccumulateTests(BaseExtensionTests):
    """
    Shared tests for cumulative operations on extension arrays.

    Accumulations generally only make sense for numeric/boolean data.
    """

    def check_accumulate(self, s, op_name, skipna):
        """
        Compare ``s.<op_name>(skipna=skipna)`` with the same operation run on
        ``s`` cast to float64, ignoring the dtype of the result.
        """
        result = getattr(s, op_name)(skipna=skipna)

        is_float32_result = result.dtype == pd.Float32Dtype()
        if is_float32_result and op_name == "cumprod" and skipna:
            pytest.skip(
                f"Float32 precision lead to large differences with op {op_name} "
                f"and skipna={skipna}"
            )

        reference = s.astype("float64")
        expected = getattr(reference, op_name)(skipna=skipna)
        self.assert_series_equal(result, expected, check_dtype=False)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
        """The accumulation is expected to raise NotImplementedError."""
        ser = pd.Series(data)

        with pytest.raises(NotImplementedError):
            getattr(ser, all_numeric_accumulations)(skipna=skipna)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
        """The accumulation is expected to match the float64 reference."""
        self.check_accumulate(pd.Series(data), all_numeric_accumulations, skipna)

pandas/tests/extension/test_boolean.py

+15
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import numpy as np
1717
import pytest
1818

19+
from pandas.core.dtypes.common import is_bool_dtype
20+
1921
import pandas as pd
2022
import pandas._testing as tm
2123
from pandas.core.arrays.boolean import BooleanDtype
@@ -393,6 +395,19 @@ class TestUnaryOps(base.BaseUnaryOpsTests):
393395
pass
394396

395397

398+
class TestAccumulation(base.BaseAccumulateTests):
    """Accumulation tests for the boolean extension array."""

    def check_accumulate(self, s, op_name, skipna):
        """
        Compare the accumulation on ``s`` with the same accumulation on a
        float64 Series, and check that cummin/cummax keep a boolean dtype.
        """
        result = getattr(s, op_name)(skipna=skipna)

        as_float = pd.Series(s.astype("float64"))
        expected = getattr(as_float, op_name)(skipna=skipna)
        tm.assert_series_equal(result, expected, check_dtype=False)

        if op_name in ("cummin", "cummax"):
            assert is_bool_dtype(result)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
        """Overridden with an empty body: the inherited 'raises' check is not run."""
409+
410+
396411
class TestParsing(base.BaseParsingTests):
397412
pass
398413

pandas/tests/extension/test_categorical.py

+6
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,12 @@ class TestReduce(base.BaseNoReduceTests):
156156
pass
157157

158158

159+
class TestAccumulate(base.BaseAccumulateTests):
    """Accumulation tests for the categorical extension array."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
        """Overridden with an empty body: the inherited comparison test is not run."""
163+
164+
159165
class TestMethods(base.BaseMethodsTests):
160166
@pytest.mark.xfail(reason="Unobserved categories included")
161167
def test_value_counts(self, all_data, dropna):

0 commit comments

Comments
 (0)