-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
[WIP, ENH] Adds cumulative methods to ea #28509
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 67 commits
2897723
a54e1b4
10abc0f
c2d7592
2c149c0
79cea11
12a5ca3
dc959f4
bcfb8a8
9a8f4ec
5d837d9
9e9f0c3
a1a1cb2
6d967ad
73363bf
0d9a3d5
84a7d81
ce6869d
3b5d1d8
0337cb0
f0722f5
99baa1b
fa35b14
43fca7c
7bd6378
185510b
2ef9ebb
7d898bd
09b42be
af0dd24
bc9a36a
38454a3
5ecfa51
8d62594
5f3b624
06d1286
7efcb5f
ae5f969
f7e3f4f
aa99927
9cab6d9
b3ae864
52e6486
99fb664
d339250
be6f974
a902f4e
64afb5b
1e5d77b
c95b490
dc669de
08475a4
67fa99a
a36632b
b3d3c81
f8f6367
ad6773d
663c301
e17f3a0
8cb66f9
4f953cf
b33a5df
18ec178
305bdc7
56bfb23
63db854
9c91c55
6ba3ca9
f2a49b3
386fa39
55de384
6a5b7f8
9c63c64
84dd141
2f23499
a5b30e6
d22c8a0
a5866c7
8255457
483b608
150fd3b
80e2dc6
dceab99
1c14f18
e20501a
53147c4
597e978
5ebe8ea
32367c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -70,6 +70,7 @@ class ExtensionArray: | |
take | ||
unique | ||
view | ||
_accumulate | ||
_concat_same_type | ||
_formatter | ||
_from_factorized | ||
|
@@ -119,8 +120,9 @@ class ExtensionArray: | |
as they only compose abstract methods. Still, a more efficient | ||
implementation may be available, and these methods can be overridden. | ||
|
||
One can implement methods to handle array reductions. | ||
One can implement methods to handle array accumulations or reductions. | ||
|
||
* _accumulate | ||
* _reduce | ||
|
||
One can implement methods to handle parsing from strings that will be used | ||
|
@@ -1154,7 +1156,37 @@ def _concat_same_type( | |
# of objects | ||
_can_hold_na = True | ||
|
||
def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": | ||
datajanko marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
Return an ExtensionArray performing an accumulation operation. | ||
The underlying data type might change | ||
|
||
Parameters | ||
---------- | ||
name : str | ||
Name of the function, supported values are: | ||
- cummin | ||
- cummax | ||
- cumsum | ||
- cumprod | ||
skipna : bool, default True | ||
If True, skip NA values. | ||
**kwargs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May be for numpy compatibility (axis, dtype, out). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are your expectations adding this? Do you purely want to have it (for this ticket) as accessor, which we'll ignore? If not, I'd guess the impact of axis is None, but dtype might be interesting. I.e for cumsum and integer dtypes, we could also provide the target output type, so not defaulting to (U)Int64. But I don't know if this should be part of this issue There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea would be for something like |
||
Additional keyword arguments passed to the accumulation function. | ||
Currently, there is no supported kwarg. | ||
|
||
Returns | ||
------- | ||
array | ||
|
||
Raises | ||
------ | ||
TypeError : subclass does not define accumulations | ||
""" | ||
raise TypeError(f"cannot perform {name} with type {self.dtype}") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NotImplementedError? |
||
|
||
def _reduce(self, name: str, skipna: bool = True, **kwargs): | ||
|
||
""" | ||
Return a scalar result of performing the reduction operation. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1690,6 +1690,28 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: | |
result = type(values)._simple_new( # type: ignore[attr-defined] | ||
result, dtype=orig_dtype | ||
) | ||
from pandas.core.arrays import IntegerArray | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these should be done in NumericArray rather than here |
||
|
||
if isinstance(values, IntegerArray): | ||
data = values._data | ||
mask = values._mask | ||
|
||
fill_value = { | ||
np.cumprod: 1, | ||
np.maximum.accumulate: data.min(), | ||
np.cumsum: 0, | ||
np.minimum.accumulate: data.max(), | ||
}[accum_func] | ||
|
||
values, mask, dtype, dtype_max, fill_value = _get_values( | ||
data, skipna=skipna, fill_value=fill_value, mask=mask | ||
) | ||
|
||
if not skipna: | ||
mask = np.maximum.accumulate(mask) | ||
|
||
vals = accum_func(values) | ||
result = IntegerArray(vals, mask) | ||
|
||
elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): | ||
vals = values.copy() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import pytest | ||
|
||
import pandas as pd | ||
|
||
from .base import BaseExtensionTests | ||
|
||
|
||
class BaseAccumulateTests(BaseExtensionTests): | ||
""" | ||
Accumulation specific tests. Generally these only | ||
make sense for numeric/boolean operations. | ||
""" | ||
|
||
def check_accumulate(self, s, op_name, skipna): | ||
result = getattr(s, op_name)(skipna=skipna) | ||
expected = getattr(s.astype("float64"), op_name)(skipna=skipna) | ||
self.assert_series_equal(result, expected, check_dtype=False) | ||
|
||
|
||
class BaseNoAccumulateTests(BaseAccumulateTests): | ||
""" we don't define any accumulations """ | ||
|
||
@pytest.mark.parametrize("skipna", [True, False]) | ||
def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): | ||
op_name = all_numeric_accumulations | ||
s = pd.Series(data) | ||
|
||
with pytest.raises(TypeError): | ||
getattr(s, op_name)(skipna=skipna) | ||
|
||
|
||
class BaseNumericAccumulateTests(BaseAccumulateTests): | ||
@pytest.mark.parametrize("skipna", [True, False]) | ||
def test_accumulate_series(self, data, all_numeric_accumulations, skipna): | ||
op_name = all_numeric_accumulations | ||
s = pd.Series(data) | ||
self.check_accumulate(s, op_name, skipna) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -249,6 +249,51 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): | |
pass | ||
|
||
|
||
class TestNumericAccumulation(base.BaseNumericAccumulateTests): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we already have some tests for sparse, pls add the AccumulateTests mixin there as well. |
||
def check_accumulate(self, s, op_name, skipna): | ||
# overwrite to ensure pd.NA is tested instead of np.nan | ||
# https://github.com/pandas-dev/pandas/issues/30958 | ||
if op_name == "cumsum": | ||
if s.dtype.name.startswith("U"): | ||
expected_dtype = "uint64" | ||
else: | ||
expected_dtype = "int64" | ||
result = getattr(s, op_name)(skipna=skipna) | ||
expected = pd.Series( | ||
integer_array( | ||
getattr(s.astype("float64"), op_name)(skipna=skipna), | ||
dtype=expected_dtype, | ||
) | ||
) | ||
tm.assert_series_equal(result, expected) | ||
elif op_name in ["cummax", "cummin"]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please try to make this simpler (the entire function), you are repeating lots of things |
||
expected_dtype = s.dtype | ||
result = getattr(s, op_name)(skipna=skipna) | ||
expected = pd.Series( | ||
integer_array( | ||
getattr(s.astype("float64"), op_name)(skipna=skipna), | ||
dtype=expected_dtype, | ||
) | ||
) | ||
tm.assert_series_equal(result, expected) | ||
elif op_name == "cumprod": | ||
if s.dtype.name.startswith("U"): | ||
expected_dtype = "uint64" | ||
else: | ||
expected_dtype = "int64" | ||
result = getattr(s[:20], op_name)(skipna=skipna) | ||
expected = pd.Series( | ||
integer_array( | ||
getattr(s[:20].astype("float64"), op_name)(skipna=skipna), | ||
dtype=expected_dtype, | ||
) | ||
) | ||
tm.assert_series_equal(result, expected) | ||
|
||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this needs a separate test for invalid names |
||
raise | ||
|
||
|
||
class TestPrinting(base.BasePrintingTests): | ||
pass | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use single backticks around the
:meth:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
and move to 1.3