From c2d75928b75507d79723899e8717c50ede42c02e Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 18 Sep 2019 21:28:09 +0200 Subject: [PATCH 01/89] define accumulation interface for ExtensionArrays --- pandas/core/arrays/base.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0778b6726d104..5946472f8031a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -65,6 +65,7 @@ class ExtensionArray: take unique view + _accumulate _concat_same_type _formatter _from_factorized @@ -114,8 +115,9 @@ class ExtensionArray: as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. - One can implement methods to handle array reductions. + One can implement methods to handle array accumulations or reductions. + * _accumulate * _reduce One can implement methods to handle parsing from strings that will be used @@ -407,6 +409,7 @@ def isna(self) -> ArrayLike: * ``na_values._is_boolean`` should be True * `na_values` should implement :func:`ExtensionArray._reduce` + * `na_values` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented """ raise AbstractMethodError(self) @@ -992,6 +995,35 @@ def _ndarray_values(self) -> np.ndarray: """ return np.array(self) + def _accumulate(self, name, skipna=True, **kwargs): + """ + Return an array result of performing the accumulation operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { cummin, cummax, cumsum, cumprod }. + skipna : bool, default True + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, no is the only supported kwarg. + + Returns + ------- + array + + Raises + ------ + TypeError : subclass does not define accumulations + """ + raise TypeError( + "cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype + ) + ) + def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. From 2c149c0693d19930fb09fc8da5f1cf759892283a Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 19 Sep 2019 20:38:42 +0200 Subject: [PATCH 02/89] reformulate doc string --- pandas/core/arrays/base.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5946472f8031a..994c9584fe68c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -409,7 +409,6 @@ def isna(self) -> ArrayLike: * ``na_values._is_boolean`` should be True * `na_values` should implement :func:`ExtensionArray._reduce` - * `na_values` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented """ raise AbstractMethodError(self) @@ -995,20 +994,27 @@ def _ndarray_values(self) -> np.ndarray: """ return np.array(self) - def _accumulate(self, name, skipna=True, **kwargs): + def _accumulate(self, name, skipna=True, **kwargs) -> ABCExtensionArray: """ - Return an array result of performing the accumulation operation. + Return an ExtensionArray performing the accumulation operation. + The underlying data type might change + # TODO Clarify Parameters ---------- name : str Name of the function, supported values are: - { cummin, cummax, cumsum, cumprod }. + # TODO Add function signatures + - cummin + - cummax + - cumsum + - cumprod skipna : bool, default True - If True, skip NaN values. + If True, skip NA values. **kwargs Additional keyword arguments passed to the accumulation function. - Currently, no is the only supported kwarg. + # TODO check if kwargs are needed + Currently, there is no supported kwarg. Returns ------- From 79cea1138c229f57a45727219b41948971715dbf Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 19 Sep 2019 20:50:46 +0200 Subject: [PATCH 03/89] creates baseExtension tests for accumulate --- pandas/tests/extension/base/accumulate.py | 60 +++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 pandas/tests/extension/base/accumulate.py diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py new file mode 100644 index 0000000000000..79a5d93b37469 --- /dev/null +++ b/pandas/tests/extension/base/accumulate.py @@ -0,0 +1,60 @@ +import warnings + +import pytest + +import pandas as pd +import pandas.util.testing as tm + +from .base import BaseExtensionTests + + +class BaseAccumulateTests(BaseExtensionTests): + """ + Accumulation specific tests. Generally these only + make sense for numeric/boolean operations. + """ + + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + +class BaseNoAccumulateTests(BaseAccumulateTests): + """ we don't define any accumulation """ + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_boolean(self, data, all_boolean_accumulations, skipna): + op_name = all_boolean_accumulations + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + +class BaseNumericAccumulateTests(BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + s = pd.Series(data) + + # min/max with empty produce numpy warnings + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + self.check_accumulate(s, op_name, skipna) + + +class BaseBooleanAccumulateTests(BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_boolean_accumulations, skipna): + op_name = all_boolean_accumulations + s = pd.Series(data) + self.check_accumulate(s, op_name, skipna) From 12a5ca30f000a137c6986e8798bba21ef64870a7 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 4 Oct 2019 22:05:05 +0200 Subject: [PATCH 04/89] adds fixtures for numeric_accumulations --- pandas/conftest.py | 11 +++++++++++ pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/accumulate.py | 16 ---------------- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b032e14d8f7e1..07228949daf8f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -229,6 +229,17 @@ def all_boolean_reductions(request): return request.param +_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + + +@pytest.fixture(params=_all_numeric_accumulations) +def all_numeric_accumulations(request): + """ + Fixture for numeric reduction names + """ + return request.param + + _cython_table = pd.core.base.SelectionMixin._cython_table.items() diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 090df35bd94c9..a27352e856e73 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -40,6 +40,7 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. """ +# from .accumulate import BaseNoAccumulateTests, BaseNumericAccumulateTests from .casting import BaseCastingTests # noqa from .constructors import BaseConstructorsTests # noqa from .dtype import BaseDtypeTests # noqa diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 79a5d93b37469..e523e3e2c38ec 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -31,14 +31,6 @@ def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna with pytest.raises(TypeError): getattr(s, op_name)(skipna=skipna) - @pytest.mark.parametrize("skipna", [True, False]) - def test_accumulate_series_boolean(self, data, all_boolean_accumulations, skipna): - op_name = all_boolean_accumulations - s = pd.Series(data) - - with pytest.raises(TypeError): - getattr(s, op_name)(skipna=skipna) - class BaseNumericAccumulateTests(BaseAccumulateTests): @pytest.mark.parametrize("skipna", [True, False]) @@ -50,11 +42,3 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna): with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) self.check_accumulate(s, op_name, skipna) - - -class BaseBooleanAccumulateTests(BaseAccumulateTests): - @pytest.mark.parametrize("skipna", [True, False]) - def test_accumulate_series(self, data, all_boolean_accumulations, skipna): - op_name = all_boolean_accumulations - s = pd.Series(data) - self.check_accumulate(s, op_name, skipna) From dc959f45050c8e550dc88fdc7c7c5f17d797b823 Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 13 Nov 2019 21:45:43 +0100 Subject: [PATCH 05/89] fixes typos --- pandas/conftest.py | 2 +- pandas/tests/extension/base/accumulate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 07228949daf8f..a63f444bc30ef 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -235,7 +235,7 @@ def all_boolean_reductions(request): @pytest.fixture(params=_all_numeric_accumulations) def all_numeric_accumulations(request): """ - Fixture for numeric reduction names + Fixture for numeric accumulation names """ return request.param diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index e523e3e2c38ec..e35aa5b198a09 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -21,7 +21,7 @@ def check_accumulate(self, s, op_name, skipna): class BaseNoAccumulateTests(BaseAccumulateTests): - """ we don't define any accumulation """ + """ we don't define any accumulations """ @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): From bcfb8a835fd319adfa1d2525434ca8e67ad374f0 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 10 Dec 2019 22:21:19 +0100 Subject: [PATCH 06/89] adds accumulate tests for integer arrays --- pandas/tests/extension/base/__init__.py | 2 +- pandas/tests/extension/base/accumulate.py | 1 - pandas/tests/extension/test_integer.py | 4 ++++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index a27352e856e73..0497b631f693e 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -40,7 +40,7 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. """ -# from .accumulate import BaseNoAccumulateTests, BaseNumericAccumulateTests +from .accumulate import BaseNoAccumulateTests, BaseNumericAccumulateTests # noqa from .casting import BaseCastingTests # noqa from .constructors import BaseConstructorsTests # noqa from .dtype import BaseDtypeTests # noqa diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index e35aa5b198a09..0f386ee9f06eb 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -38,7 +38,6 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations s = pd.Series(data) - # min/max with empty produce numpy warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) self.check_accumulate(s, op_name, skipna) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..8b0229bcac19a 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -229,6 +229,10 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): pass +class TestNumeriAccumulation(base.BaseNumericAccumulateTests): + pass + + class TestPrinting(base.BasePrintingTests): pass From 9a8f4ec98e2481f2f917da4268faa19c3dfabd3f Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 12 Dec 2019 18:32:00 +0100 Subject: [PATCH 07/89] fixes typo --- pandas/tests/extension/test_integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 8b0229bcac19a..9ffe1241fe208 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -229,7 +229,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): pass -class TestNumeriAccumulation(base.BaseNumericAccumulateTests): +class TestNumericAccumulation(base.BaseNumericAccumulateTests): pass From 5d837d9def42a72d2fed3232e18f9c70fb9c8261 Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 9 Jan 2020 21:22:58 +0100 Subject: [PATCH 08/89] first implementation of cumsum --- pandas/core/arrays/integer.py | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 7b03bf35faf25..007a02a0f8e8b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -642,6 +642,56 @@ def cmp_method(self, other): name = "__{name}__".format(name=op.__name__) return set_function_name(cmp_method, name, cls) + def _accumulate(self, name, skipna=True, **kwargs): + data = self._data + mask = self._mask + + from ..nanops import _get_values + + if name == "cumsum": + fill_value = 0 + if name == "cumprod": + fill_value = 1 + if name == "cummax": + fill_value = data.min() + if name == "cummin": + fill_value = data.max() + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=True, fill_value=fill_value, mask=mask, + ) + + if name == "cumsum": + return IntegerArray(values.cumsum(dtype=dtype_max), mask) + # # cumsum impute with 0 just add, afterwards replace again if needed + # # cumprod replace nan by 1, cumprod the, maybe float here necessary? + # # cummax, impute by min value np.maximum accumulate. Replace again + # # cummin, impute by max value, np.minimum.accumulate. replace again afterwards + # # coerce to a nan-aware float if needed + # if mask.any(): + # data = self._data.astype("float64") + # data[mask] = self._na_value + + # from ..nanops import _get_values + + # data[mask] = 0 + # self._data = data.cumsum() + + # values, mask, dtype, dtype_max, _ = _ get_values( + # data, skipna, fill_value=0, mask=mask + # ) + # the_cumsum = values.cumsum(axis=0, dtype=dtype_max) + # the_cumsum =_maybe_null_out(the_cumsum, axis=0, mask=mask, + # values=values.shape, min_count=min_count) + + # # TODO: check in nanops: + # # - _get_values + # # - _maybe_null_out + # # - _wrap_restuls + # # - _maybe_get_mask + + # return self + def _reduce(self, name, skipna=True, **kwargs): data = self._data mask = self._mask From 73363bfdde30eda50a90b26688f788eb9207de49 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 15 Mar 2020 08:51:14 +0100 Subject: [PATCH 09/89] stashed merge conflict --- pandas/core/generic.py | 19 +++++++++++++++++++ pandas/tests/extension/test_categorical.py | 2 ++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8d56311331d4d..bec901a3d69f6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -100,6 +100,7 @@ from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME +from pandas.core.dtypes.base import ExtensionDtype from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles @@ -11197,6 +11198,9 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): axis = self._stat_axis_number else: axis = self._get_axis_number(axis) + + if issubclass(self.dtype, ExtensionDtype): + return self._accumulate(name, skipna=skipna) if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T @@ -11211,6 +11215,21 @@ def block_accum_func(blk_values): result = self._data.apply(block_accum_func) + # y = com.values_from_object(self).copy() + + # if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): + # result = accum_func(y, axis) + # mask = isna(self) + # np.putmask(result, mask, iNaT) + # elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): + # mask = isna(self) + # np.putmask(y, mask, mask_a) + # result = accum_func(y, axis) + # np.putmask(result, mask, mask_b) + # # TODO: probably here, we need to call self._accumulate if the proper subclass is available + # else: + # result = accum_func(y, axis) + d = self._construct_axes_dict() d["copy"] = False return self._constructor(result, **d).__finalize__(self) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 059d3453995bd..1d5083739431f 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -161,6 +161,8 @@ def test_fillna_limit_backfill(self, data_missing): class TestReduce(base.BaseNoReduceTests): pass +class TestAccumulate(base.BaseNoAccumulateTests): + pass class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unobserved categories included") From 0d9a3d582f315cc8398939d6d5f1651684e4001a Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 15 Mar 2020 08:55:50 +0100 Subject: [PATCH 10/89] fixes formatting --- pandas/core/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bec901a3d69f6..ba68718059db7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -56,6 +56,7 @@ validate_percentile, ) +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -100,7 +101,6 @@ from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME -from pandas.core.dtypes.base import ExtensionDtype from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles @@ -11198,7 +11198,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): axis = self._stat_axis_number else: axis = self._get_axis_number(axis) - + if issubclass(self.dtype, ExtensionDtype): return self._accumulate(name, skipna=skipna) @@ -11226,10 +11226,11 @@ def block_accum_func(blk_values): # np.putmask(y, mask, mask_a) # result = accum_func(y, axis) # np.putmask(result, mask, mask_b) - # # TODO: probably here, we need to call self._accumulate if the proper subclass is available + # # TODO: probably here, we need to call + # self._accumulate if the proper subclass is available # else: # result = accum_func(y, axis) - + d = self._construct_axes_dict() d["copy"] = False return self._constructor(result, **d).__finalize__(self) From 84a7d81236a7b3eb348ac39db6cc807928fea4a2 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 23 Mar 2020 22:29:50 +0100 Subject: [PATCH 11/89] first green test for integer extension arrays and cumsum --- pandas/core/generic.py | 6 ++++-- pandas/tests/extension/test_categorical.py | 2 ++ pandas/tests/extension/test_integer.py | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba68718059db7..150a01ae45cf5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11199,8 +11199,10 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - if issubclass(self.dtype, ExtensionDtype): - return self._accumulate(name, skipna=skipna) + # mimicking from series._reduce, which delegates + delegate = self._values + if isinstance(delegate.dtype, ExtensionDtype): + return delegate._accumulate(name, skipna=skipna, **kwargs) if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 1d5083739431f..0f01b5677cd85 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -161,9 +161,11 @@ def test_fillna_limit_backfill(self, data_missing): class TestReduce(base.BaseNoReduceTests): pass + class TestAccumulate(base.BaseNoAccumulateTests): pass + class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unobserved categories included") def test_value_counts(self, all_data, dropna): diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 80ffae1b9f596..adb62dc27fc03 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -249,7 +249,12 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): class TestNumericAccumulation(base.BaseNumericAccumulateTests): - pass + def check_accumulate(self, s, op_name, skipna): + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + result = getattr(s, op_name)(skipna=skipna) + expected = integer_array(getattr(s.astype("float64"), op_name)(skipna=skipna)) + tm.assert_extension_array_equal(result, expected) class TestPrinting(base.BasePrintingTests): From ce6869df4d549a53fc88eaa9705c1dee168298ea Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 2 Apr 2020 21:15:47 +0200 Subject: [PATCH 12/89] first passing tests for cummin and cummax --- pandas/core/arrays/integer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0cb4bfb2c1539..425a011af96b9 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -589,6 +589,13 @@ def _accumulate(self, name: str, skipna: bool = True, **kwargs): if name == "cumsum": return IntegerArray(values.cumsum(dtype=dtype_max), mask) + elif name == "cumprod": + return IntegerArray(values.cumprod(dtype=dtype_max), mask) + elif name == "cummax": + return np.maximum.accumulate(IntegerArray(values, mask)) + elif name == "cummin": + return np.minimum.accumulate(IntegerArray(values, mask)) + # # cumsum impute with 0 just add, afterwards replace again if needed # # cumprod replace nan by 1, cumprod the, maybe float here necessary? # # cummax, impute by min value np.maximum accumulate. Replace again From 3b5d1d8b3a0c0763076ef8f3a51cc1d1277fe1eb Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 5 Apr 2020 21:26:15 +0200 Subject: [PATCH 13/89] utilizes na_accum_func --- pandas/core/arrays/integer.py | 45 ++++++++++++++++++++--------------- pandas/core/generic.py | 1 + pandas/core/nanops.py | 27 +++++++++++++++++++++ 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 425a011af96b9..3c70efa391aea 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -569,32 +569,39 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _accumulate(self, name: str, skipna: bool = True, **kwargs): - data = self._data - mask = self._mask + # data = self._data + # mask = self._mask - from ..nanops import _get_values + # from ..nanops import _get_values - if name == "cumsum": - fill_value = 0 - if name == "cumprod": - fill_value = 1 - if name == "cummax": - fill_value = data.min() - if name == "cummin": - fill_value = data.max() - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=True, fill_value=fill_value, mask=mask, - ) + # if name == "cumsum": + # fill_value = 0 + # if name == "cumprod": + # fill_value = 1 + # if name == "cummax": + # fill_value = data.min() + # if name == "cummin": + # fill_value = data.max() + + # values, mask, dtype, dtype_max, fill_value = _get_values( + # data, skipna=True, fill_value=fill_value, mask=mask, + # ) + from ..nanops import na_accum_func if name == "cumsum": - return IntegerArray(values.cumsum(dtype=dtype_max), mask) + # return IntegerArray(values.cumsum(dtype=dtype_max), mask) + return na_accum_func(self, np.cumsum, skipna=skipna) elif name == "cumprod": - return IntegerArray(values.cumprod(dtype=dtype_max), mask) + # return IntegerArray(values.cumprod(dtype=dtype_max), mask) + return na_accum_func(self, np.cumprod, skipna=skipna) + elif name == "cummax": - return np.maximum.accumulate(IntegerArray(values, mask)) + # return np.maximum.accumulate(IntegerArray(values, mask)) + return na_accum_func(self, np.maximum.accumulate, skipna=skipna) + elif name == "cummin": - return np.minimum.accumulate(IntegerArray(values, mask)) + # return np.minimum.accumulate(IntegerArray(values, mask)) + return na_accum_func(self, np.minimum.accumulate, skipna=skipna) # # cumsum impute with 0 just add, afterwards replace again if needed # # cumprod replace nan by 1, cumprod the, maybe float here necessary? diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 150a01ae45cf5..aa25fc79caf49 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11200,6 +11200,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): axis = self._get_axis_number(axis) # mimicking from series._reduce, which delegates + # using na_accum_func_now delegate = self._values if isinstance(delegate.dtype, ExtensionDtype): return delegate._accumulate(name, skipna=skipna, **kwargs) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a5e70bd279d21..9435953506363 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1558,10 +1558,37 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: if isinstance(values, np.ndarray): result = result.view(orig_dtype) + else: # DatetimeArray result = type(values)._from_sequence(result, dtype=orig_dtype) + from pandas.core.arrays import IntegerArray + + if isinstance(values, IntegerArray): + data = values._data + mask = values._mask + + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: data.min(), + np.cumsum: 0, + np.minimum.accumulate: data.max(), + }[accum_func] + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + vals = accum_func(values) + + from pandas import Series + + result = Series(IntegerArray(vals, mask)) + elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() mask = isna(vals) From 0337cb0405816fd2bb27d5698e86023f8d664c4a Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 5 Apr 2020 21:30:59 +0200 Subject: [PATCH 14/89] removes delegation leftover --- pandas/core/generic.py | 6 +++--- pandas/core/nanops.py | 5 +---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aa25fc79caf49..e2682d340a5a8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11201,9 +11201,9 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): # mimicking from series._reduce, which delegates # using na_accum_func_now - delegate = self._values - if isinstance(delegate.dtype, ExtensionDtype): - return delegate._accumulate(name, skipna=skipna, **kwargs) + # delegate = self._values + # if isinstance(delegate.dtype, ExtensionDtype): + # return delegate._accumulate(name, skipna=skipna, **kwargs) if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 9435953506363..7e9b7896c9b65 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1584,10 +1584,7 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: mask = np.maximum.accumulate(mask) vals = accum_func(values) - - from pandas import Series - - result = Series(IntegerArray(vals, mask)) + result = IntegerArray(vals, mask) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() From f0722f5c42f6b0dfe25371184f27f0651d8659ab Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 9 Apr 2020 21:42:33 +0200 Subject: [PATCH 15/89] creates running tests --- pandas/tests/extension/test_integer.py | 42 ++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index adb62dc27fc03..9b5c55196bf4c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -252,9 +252,45 @@ class TestNumericAccumulation(base.BaseNumericAccumulateTests): def check_accumulate(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 - result = getattr(s, op_name)(skipna=skipna) - expected = integer_array(getattr(s.astype("float64"), op_name)(skipna=skipna)) - tm.assert_extension_array_equal(result, expected) + if op_name == "cumsum": + if s.dtype.name.startswith("U"): + expected_dtype = "UInt64" + else: + expected_dtype = "Int64" + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + integer_array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name in ["cummax", "cummin"]: + expected_dtype = s.dtype + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + integer_array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name == "cumprod": + if s.dtype.name.startswith("U"): + expected_dtype = "UInt64" + else: + expected_dtype = "Int64" + result = getattr(s[:20], op_name)(skipna=skipna) + expected = pd.Series( + integer_array( + getattr(s[:20].astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + + else: + raise class TestPrinting(base.BasePrintingTests): From fa35b141e076a9cbac3f72adfd6a2f7e190d8c22 Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 9 Apr 2020 22:10:48 +0200 Subject: [PATCH 16/89] removes ABCExtensionArray Type hint --- pandas/core/arrays/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d11e78e3cd696..bc462aa14c2de 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1025,8 +1025,7 @@ def _concat_same_type( # of objects _can_hold_na = True - - def _accumulate(self, name, skipna=True, **kwargs) -> ABCExtensionArray: + def _accumulate(self, name, skipna=True, **kwargs): """ Return an ExtensionArray performing the accumulation operation. The underlying data type might change @@ -1062,7 +1061,6 @@ def _accumulate(self, name, skipna=True, **kwargs) -> ABCExtensionArray: ) ) - def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. From 185510b68cbc865a0696e35d4529b2f37b7aa36c Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 08:19:16 +0200 Subject: [PATCH 17/89] removes clutter from generic.py --- pandas/core/generic.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c2c642b9c68ed..7c22a47801dd0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -56,7 +56,6 @@ validate_percentile, ) -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -11181,12 +11180,6 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - # mimicking from series._reduce, which delegates - # using na_accum_func_now - # delegate = self._values - # if isinstance(delegate.dtype, ExtensionDtype): - # return delegate._accumulate(name, skipna=skipna, **kwargs) - if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T From 2ef9ebbd7a8bac636672e4e0f206c0c23a3d1241 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 08:27:16 +0200 Subject: [PATCH 18/89] removes clutter in _accumulate --- pandas/core/arrays/integer.py | 52 ----------------------------------- 1 file changed, 52 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 5a0e48c8f236a..636a923e791f6 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -559,69 +559,17 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _accumulate(self, name: str, skipna: bool = True, **kwargs): - # data = self._data - # mask = self._mask - - # from ..nanops import _get_values - - # if name == "cumsum": - # fill_value = 0 - # if name == "cumprod": - # fill_value = 1 - # if name == "cummax": - # fill_value = data.min() - # if name == "cummin": - # fill_value = data.max() - - # values, mask, dtype, dtype_max, fill_value = _get_values( - # data, skipna=True, fill_value=fill_value, mask=mask, - # ) from ..nanops import na_accum_func if name == "cumsum": - # return IntegerArray(values.cumsum(dtype=dtype_max), mask) return na_accum_func(self, np.cumsum, skipna=skipna) elif name == "cumprod": - # return IntegerArray(values.cumprod(dtype=dtype_max), mask) return na_accum_func(self, np.cumprod, skipna=skipna) - elif name == "cummax": - # return np.maximum.accumulate(IntegerArray(values, mask)) return na_accum_func(self, np.maximum.accumulate, skipna=skipna) - elif name == "cummin": - # return np.minimum.accumulate(IntegerArray(values, mask)) return na_accum_func(self, np.minimum.accumulate, skipna=skipna) - # # cumsum impute with 0 just add, afterwards replace again if needed - # # cumprod replace nan by 1, cumprod the, maybe float here necessary? - # # cummax, impute by min value np.maximum accumulate. Replace again - # # cummin, impute by max value, np.minimum.accumulate. replace again afterwards - # # coerce to a nan-aware float if needed - # if mask.any(): - # data = self._data.astype("float64") - # data[mask] = self._na_value - - # from ..nanops import _get_values - - # data[mask] = 0 - # self._data = data.cumsum() - - # values, mask, dtype, dtype_max, _ = _ get_values( - # data, skipna, fill_value=0, mask=mask - # ) - # the_cumsum = values.cumsum(axis=0, dtype=dtype_max) - # the_cumsum =_maybe_null_out(the_cumsum, axis=0, mask=mask, - # values=values.shape, min_count=min_count) - - # # TODO: check in nanops: - # # - _get_values - # # - _maybe_null_out - # # - _wrap_restuls - # # - _maybe_get_mask - - # return self - def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask From 7d898bd50352a7381a5d23c60e524f3e8902b167 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 15:52:54 +0200 Subject: [PATCH 19/89] adds typehints for ExtensionArray and IntegerArray --- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/integer.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bc462aa14c2de..cd6b30ce120f4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1025,7 +1025,7 @@ def _concat_same_type( # of objects _can_hold_na = True - def _accumulate(self, name, skipna=True, **kwargs): + def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": """ Return an ExtensionArray performing the accumulation operation. The underlying data type might change diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 636a923e791f6..ae8447637e367 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -558,17 +558,18 @@ def cmp_method(self, other): name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) - def _accumulate(self, name: str, skipna: bool = True, **kwargs): + def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": from ..nanops import na_accum_func if name == "cumsum": - return na_accum_func(self, np.cumsum, skipna=skipna) + result = na_accum_func(self, np.cumsum, skipna=skipna) elif name == "cumprod": - return na_accum_func(self, np.cumprod, skipna=skipna) + resut = na_accum_func(self, np.cumprod, skipna=skipna) elif name == "cummax": - return na_accum_func(self, np.maximum.accumulate, skipna=skipna) + result = na_accum_func(self, np.maximum.accumulate, skipna=skipna) elif name == "cummin": - return na_accum_func(self, np.minimum.accumulate, skipna=skipna) + result = na_accum_func(self, np.minimum.accumulate, skipna=skipna) + return result def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data From 09b42be676640d1fb66cb2cd1872e7f2d55472cc Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 21:25:56 +0200 Subject: [PATCH 20/89] delegates the accumulate calls to extension arrays --- pandas/core/arrays/integer.py | 30 ++++++++++++++++++++---------- pandas/core/generic.py | 6 +++++- pandas/core/nanops.py | 23 ----------------------- 3 files changed, 25 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ae8447637e367..d078bcd38f5cf 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -559,16 +559,26 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": - from ..nanops import na_accum_func - - if name == "cumsum": - result = na_accum_func(self, np.cumsum, skipna=skipna) - elif name == "cumprod": - resut = na_accum_func(self, np.cumprod, skipna=skipna) - elif name == "cummax": - result = na_accum_func(self, np.maximum.accumulate, skipna=skipna) - elif name == "cummin": - result = na_accum_func(self, np.minimum.accumulate, skipna=skipna) + data = self._data + mask = self._mask + + cum_function, fill_value = { + "cumprod": (np.cumprod, 1), + "cummax": (np.maximum.accumulate, data.min()), + "cumsum": (np.cumsum, 0), + "cummin": (np.minimum.accumulate, data.max()), + }[name] + from ..nanops import _get_values + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + vals = cum_function(values) + result = IntegerArray(vals, mask) return result def _reduce(self, name: str, skipna: bool = True, **kwargs): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7c22a47801dd0..aa62a38dfa67b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -84,6 +84,7 @@ import pandas as pd from pandas.core import missing, nanops import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -11191,7 +11192,10 @@ def block_accum_func(blk_values): result = result.T if hasattr(result, "T") else result return result - result = self._mgr.apply(block_accum_func) + if isinstance(self.values, ExtensionArray): + result = self.values._accumulate(name, skipna, **kwargs) + else: + result = self._mgr.apply(block_accum_func) # y = com.values_from_object(self).copy() diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a7dbfb51d4f23..0e68c24dc54d7 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1593,29 +1593,6 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: # DatetimeArray result = type(values)._from_sequence(result, dtype=orig_dtype) - from pandas.core.arrays import IntegerArray - - if isinstance(values, IntegerArray): - data = values._data - mask = values._mask - - fill_value = { - np.cumprod: 1, - np.maximum.accumulate: data.min(), - np.cumsum: 0, - np.minimum.accumulate: data.max(), - }[accum_func] - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - vals = accum_func(values) - result = IntegerArray(vals, mask) - elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() mask = isna(vals) From af0dd24627d25702bc6578d54adaa459a9faab62 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 21:52:39 +0200 Subject: [PATCH 21/89] removes diff in nanops --- pandas/core/nanops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0e68c24dc54d7..9494248a423a8 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1588,7 +1588,6 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: if isinstance(values, np.ndarray): result = result.view(orig_dtype) - else: # DatetimeArray result = type(values)._from_sequence(result, dtype=orig_dtype) From bc9a36ad5ec1a02eca3310a372a91bdc14c49554 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 21:58:12 +0200 Subject: [PATCH 22/89] removes unwanted pattern --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 9799771ca7854..680d28358f7f8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -746,7 +746,7 @@ def all_logical_operators(request): """ return request.param - + _all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] From 38454a3ea4f46dccef213ae5b24aa02136947a65 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 12 Apr 2020 20:37:37 +0200 Subject: [PATCH 23/89] makes output types for sum and prod explicit --- pandas/core/arrays/integer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d078bcd38f5cf..05fb7cd2a5e30 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -577,7 +577,16 @@ def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray if not skipna: mask = np.maximum.accumulate(mask) - vals = cum_function(values) + # makes target dtypes explicit since CI showed optimal UInt32 + # dtype on test data occasionally. This was different across systems + dtype_out = dtype + if name in ["cumsum", "cumprod"]: + if dtype.name.lower().startswith("u"): + dtype_out = "UInt64" + else: + dtype_out = "Int64" + + vals = cum_function(values, dtype=dtype_out) result = IntegerArray(vals, mask) return result From 5ecfa516f550b7d64f8b4bd2726eb89bdfcb62c4 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 13 Apr 2020 08:52:17 +0200 Subject: [PATCH 24/89] makes the base accumulate test more general by not comparing types --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 0f386ee9f06eb..4cf28480fd3cb 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -17,7 +17,7 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - tm.assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected, check_dtype=False) class BaseNoAccumulateTests(BaseAccumulateTests): From 8d625943f0228583539068d9e8bec48100cf7958 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 13 Apr 2020 08:59:16 +0200 Subject: [PATCH 25/89] implements accumulation for boolean arrays --- pandas/core/arrays/boolean.py | 29 ++++++++++++++++++++++++++ pandas/tests/extension/test_boolean.py | 4 ++++ 2 files changed, 33 insertions(+) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e85534def6b97..03afc1c9c758c 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -688,6 +688,35 @@ def cmp_method(self, other): name = f"__{op.__name__}" return set_function_name(cmp_method, name, cls) + def _accumulate( + self, name: str, skipna: bool = True, **kwargs + ): # TODO Type hints not working propery here due to circular imports + data = self._data + mask = self._mask + + cum_function, fill_value = { + "cumprod": (np.cumprod, 1), + "cummax": (np.maximum.accumulate, False), + "cumsum": (np.cumsum, 0), + "cummin": (np.minimum.accumulate, True), + }[name] + from ..nanops import _get_values + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + if name in ["cumsum", "cumprod"]: + from pandas.core.arrays import IntegerArray + + result = IntegerArray(cum_function(values, dtype="UInt64"), mask) + else: + result = BooleanArray(cum_function(values, dtype=bool), mask) + return result + def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in {"any", "all"}: diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e2331b69916fb..4955bcc982def 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -346,6 +346,10 @@ class TestUnaryOps(base.BaseUnaryOpsTests): pass +class TestNumericAccumulation(base.BaseNumericAccumulateTests): + pass + + # TODO parsing not yet supported # class TestParsing(base.BaseParsingTests): # pass From 5f3b624149fc0f344db280b11702ec04833465d6 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 26 Apr 2020 08:30:28 +0200 Subject: [PATCH 26/89] uses f-string in base.py --- pandas/core/arrays/base.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cd6b30ce120f4..8303c853cb1ca 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1055,11 +1055,7 @@ def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": ------ TypeError : subclass does not define accumulations """ - raise TypeError( - "cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype - ) - ) + raise TypeError(f"cannot perform {name} with type {self.dtype}") def _reduce(self, name, skipna=True, **kwargs): """ From 06d12860b41085e1f341264dcc42daefe94d8008 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 2 May 2020 08:29:40 +0200 Subject: [PATCH 27/89] uses blockmanager also for extension arrays --- pandas/core/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aa62a38dfa67b..8a18c9cf494c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11187,15 +11187,15 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - result = nanops.na_accum_func(values, accum_func, skipna=skipna) + if is_extension_array_dtype(values.dtype): + result = values._accumulate(name, skipna, **kwargs) + else: + result = nanops.na_accum_func(values, accum_func, skipna=skipna) result = result.T if hasattr(result, "T") else result return result - if isinstance(self.values, ExtensionArray): - result = self.values._accumulate(name, skipna, **kwargs) - else: - result = self._mgr.apply(block_accum_func) + result = self._mgr.apply(block_accum_func) # y = com.values_from_object(self).copy() From f7e3f4fab7243b0ddc0237e69691f52514444290 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 3 May 2020 12:07:18 +0200 Subject: [PATCH 28/89] fixes flake8 issues --- pandas/core/arrays/boolean.py | 2 +- pandas/core/generic.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e278c1d23eeb7..2f8fbd0d015b5 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -680,7 +680,7 @@ def cmp_method(self, other): def _accumulate( self, name: str, skipna: bool = True, **kwargs - ): # TODO Type hints not working propery here due to circular imports + ): # TODO Type hints not working propery here due to circular imports data = self._data mask = self._mask diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de40599fd63ab..c0b21abeb2895 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -84,7 +84,6 @@ import pandas as pd from pandas.core import missing, nanops import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype From b3ae86475aa6a5a5eed59e61fd0c41065cbd81a5 Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 17 Jun 2020 21:46:27 +0200 Subject: [PATCH 29/89] removes uncommented code --- pandas/core/generic.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b0af05736bbe1..590032e353aa4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11534,27 +11534,6 @@ def block_accum_func(blk_values): result = self._mgr.apply(block_accum_func) - # y = com.values_from_object(self).copy() - - # if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): - # result = accum_func(y, axis) - # mask = isna(self) - # np.putmask(result, mask, iNaT) - # elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): - # mask = isna(self) - # np.putmask(y, mask, mask_a) - # result = accum_func(y, axis) - # np.putmask(result, mask, mask_b) - # # TODO: probably here, we need to call - # self._accumulate if the proper subclass is available - # else: - # result = accum_func(y, axis) - - # TODO: check later if these 3 commands are necessary -> check for failing tests - # d = self._construct_axes_dict() - # d["copy"] = False - # return self._constructor(result, **d).__finalize__(self, method=name) - return self._constructor(result).__finalize__(self, method=name) return set_function_name(cum_func, name, cls) From 52e6486bfd82f75fc8d672f86db65767051c12ec Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 17 Jun 2020 21:47:12 +0200 Subject: [PATCH 30/89] adds todo for runtime warning --- pandas/tests/extension/base/accumulate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 4cf28480fd3cb..8edad7236f3dc 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -38,6 +38,8 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations s = pd.Series(data) + # TODO: check if needed, copied from reduce + # min/max with empty produce numpy warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) self.check_accumulate(s, op_name, skipna) From 99fb664d0b08ddba18dc65a7211783241c85aa8f Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 22 Jun 2020 21:54:20 +0200 Subject: [PATCH 31/89] reuses integer array to accumulate for booleans --- pandas/core/arrays/boolean.py | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index d7ffbee7a4775..85cdfa44c50a2 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -653,34 +653,12 @@ def cmp_method(self, other): name = f"__{op.__name__}" return set_function_name(cmp_method, name, cls) - def _accumulate( - self, name: str, skipna: bool = True, **kwargs - ): # TODO Type hints not working propery here due to circular imports - data = self._data - mask = self._mask - - cum_function, fill_value = { - "cumprod": (np.cumprod, 1), - "cummax": (np.maximum.accumulate, False), - "cumsum": (np.cumsum, 0), - "cummin": (np.minimum.accumulate, True), - }[name] - from ..nanops import _get_values - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - if name in ["cumsum", "cumprod"]: - from pandas.core.arrays import IntegerArray + def _accumulate(self, name: str, skipna: bool = True, **kwargs): + from pandas.arrays import IntegerArray - result = IntegerArray(cum_function(values, dtype="UInt64"), mask) - else: - result = BooleanArray(cum_function(values, dtype=bool), mask) - return result + return IntegerArray(self._data.astype("int8"), self._mask)._accumulate( + name, skipna, **kwargs + ) def _reduce(self, name: str, skipna: bool = True, **kwargs): From d339250f84d211e9a972d6b1eeb2e56c7f0daffe Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 22 Jun 2020 22:06:05 +0200 Subject: [PATCH 32/89] removes runtimewarning catching --- pandas/tests/extension/base/accumulate.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 8edad7236f3dc..5e05b9bdbc297 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,5 +1,3 @@ -import warnings - import pytest import pandas as pd @@ -37,9 +35,4 @@ class BaseNumericAccumulateTests(BaseAccumulateTests): def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations s = pd.Series(data) - - # TODO: check if needed, copied from reduce - # min/max with empty produce numpy warnings - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - self.check_accumulate(s, op_name, skipna) + self.check_accumulate(s, op_name, skipna) From be6f9743d4d7475c022b45911c25d3d8d7556489 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 23 Jun 2020 21:38:31 +0200 Subject: [PATCH 33/89] removes TODOs --- pandas/core/arrays/base.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0e47156d2bd73..e6a8f98d2980f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1090,15 +1090,13 @@ def _concat_same_type( def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": """ - Return an ExtensionArray performing the accumulation operation. + Return an ExtensionArray performing an accumulation operation. The underlying data type might change - # TODO Clarify Parameters ---------- name : str Name of the function, supported values are: - # TODO Add function signatures - cummin - cummax - cumsum @@ -1107,7 +1105,6 @@ def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": If True, skip NA values. **kwargs Additional keyword arguments passed to the accumulation function. - # TODO check if kwargs are needed Currently, there is no supported kwarg. Returns From a902f4ed4e0ef944fe1d49c354de1244630acf3e Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 23 Jun 2020 21:40:01 +0200 Subject: [PATCH 34/89] adds accumulate to autosummary --- doc/source/reference/extensions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index fe4113d100abf..050b867cc8aa6 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -32,6 +32,7 @@ objects. .. autosummary:: :toctree: api/ + api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized From 64afb5b2c147b85a48949e31d6604f19d5b70996 Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 24 Jun 2020 20:36:13 +0200 Subject: [PATCH 35/89] excludes datetime from propagating to _accumulate --- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 590032e353aa4..ff3c54d995435 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11524,7 +11524,10 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - if is_extension_array_dtype(values.dtype): + if is_extension_array_dtype(values.dtype) and values.dtype.kind not in [ + "m", + "M", + ]: result = values._accumulate(name, skipna, **kwargs) else: result = nanops.na_accum_func(values, accum_func, skipna=skipna) From 1e5d77b217c1ff5b0ab681674a5a9cd24c4030bf Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 29 Jun 2020 22:00:00 +0200 Subject: [PATCH 36/89] uses pandas.testing instead of pandas.util.testing in accumulate --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 5e05b9bdbc297..22cae345d8ea7 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas.testing as tm from .base import BaseExtensionTests From c95b490b7167ff787a8beeec891367f987ca953c Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 30 Jun 2020 08:20:46 +0200 Subject: [PATCH 37/89] replaces assert_almost_equal with assert_series_equal --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 22cae345d8ea7..b508ab56426a3 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -15,7 +15,7 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - tm.assert_almost_equal(result, expected, check_dtype=False) + tm.assert_series_equal(result, expected, check_dtype=False) class BaseNoAccumulateTests(BaseAccumulateTests): From dc669ded93599c30a64bc7bc3d8f917db6651599 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 30 Jun 2020 20:48:35 +0200 Subject: [PATCH 38/89] dtypes to lowercase --- pandas/tests/extension/test_integer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 244197ecaa308..a321261e6b167 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -255,9 +255,9 @@ def check_accumulate(self, s, op_name, skipna): # https://github.com/pandas-dev/pandas/issues/30958 if op_name == "cumsum": if s.dtype.name.startswith("U"): - expected_dtype = "UInt64" + expected_dtype = "uint64" else: - expected_dtype = "Int64" + expected_dtype = "int64" result = getattr(s, op_name)(skipna=skipna) expected = pd.Series( integer_array( @@ -278,9 +278,9 @@ def check_accumulate(self, s, op_name, skipna): tm.assert_series_equal(result, expected) elif op_name == "cumprod": if s.dtype.name.startswith("U"): - expected_dtype = "UInt64" + expected_dtype = "uint64" else: - expected_dtype = "Int64" + expected_dtype = "int64" result = getattr(s[:20], op_name)(skipna=skipna) expected = pd.Series( integer_array( From 08475a426896ba765179d42122c5b4844dbd2e40 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 30 Jun 2020 21:42:50 +0200 Subject: [PATCH 39/89] lowercase of uint and int64 dtype in _accumulate --- pandas/core/arrays/integer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 13e1003274a78..2e2daf8e38481 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -565,9 +565,9 @@ def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray dtype_out = dtype if name in ["cumsum", "cumprod"]: if dtype.name.lower().startswith("u"): - dtype_out = "UInt64" + dtype_out = "uint64" else: - dtype_out = "Int64" + dtype_out = "int64" vals = cum_function(values, dtype=dtype_out) result = IntegerArray(vals, mask) From 67fa99ace33e21750a9f38c4192cd65f93a7f6cb Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 21 Jul 2020 21:02:27 +0200 Subject: [PATCH 40/89] uses hint of @simonjayhawkins concerning assert series equals --- pandas/tests/extension/base/accumulate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index b508ab56426a3..3670e89f12ad8 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,7 +1,6 @@ import pytest import pandas as pd -import pandas.testing as tm from .base import BaseExtensionTests @@ -15,7 +14,7 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - tm.assert_series_equal(result, expected, check_dtype=False) + self.assert_series_equal(result, expected, check_dtype=False) class BaseNoAccumulateTests(BaseAccumulateTests): From b3d3c812629e0b15ff687d955731ef4334ee1040 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 25 Jul 2020 07:36:15 +0200 Subject: [PATCH 41/89] adds whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 43d1244c15d8a..c51c2a102a205 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -340,6 +340,8 @@ Other enhancements - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) +- Added :meth:`api.extensionExtensionArray._accumulate` to the extension array interface. Implements this interface for :class: `IntegerArray` and :class: `BooleanArray` such that type coercion to `object` is avoided (:issue:`28385`) + .. --------------------------------------------------------------------------- From 8cb66f98446c0e2cbb3659d0015cbe54ed8119b8 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 10 Aug 2020 21:58:25 +0200 Subject: [PATCH 42/89] moves changes to 1.2.0 --- doc/source/whatsnew/v1.1.0.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index abe9f7413f3ce..f752aac1aece1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -341,7 +341,6 @@ Other enhancements - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) -- Added :meth:`api.extensionExtensionArray._accumulate` to the extension array interface. Implements this interface for :class: `IntegerArray` and :class: `BooleanArray` such that type coercion to `object` is avoided (:issue:`28385`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 94bb265c32e4c..ab1c8f1f19657 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -188,6 +188,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ +- Added :meth:`api.extensionExtensionArray._accumulate` to the extension array interface. Implements this interface for :class: `IntegerArray` and :class: `BooleanArray` such that type coercion to `object` is avoided (:issue:`28385`) - - From 6ba3ca948470e229455900fbb69fccff23a6c30e Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 5 Nov 2020 21:47:35 +0100 Subject: [PATCH 43/89] uses na_accum_func --- pandas/core/arrays/integer.py | 41 ++++++++++------------------------- pandas/core/nanops.py | 22 +++++++++++++++++++ 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 5e69bc0b7272e..6250ccf0c671c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -604,35 +604,18 @@ def _arith_method(self, other, op): return self._maybe_mask_result(result, mask, other, op_name) def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": - data = self._data - mask = self._mask - - cum_function, fill_value = { - "cumprod": (np.cumprod, 1), - "cummax": (np.maximum.accumulate, data.min()), - "cumsum": (np.cumsum, 0), - "cummin": (np.minimum.accumulate, data.max()), - }[name] - from ..nanops import _get_values - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - # makes target dtypes explicit since CI showed optimal UInt32 - # dtype on test data occasionally. This was different across systems - dtype_out = dtype - if name in ["cumsum", "cumprod"]: - if dtype.name.lower().startswith("u"): - dtype_out = "uint64" - else: - dtype_out = "int64" - - vals = cum_function(values, dtype=dtype_out) - result = IntegerArray(vals, mask) + cum_function = { + "cumprod": np.cumprod, + "cummax": np.maximum.accumulate, + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + }.get(name) + if not cum_function: + raise ValueError(f"{name} is not defined for IntegerArrays") + + from pandas.core.nanops import na_accum_func + + result = na_accum_func(self, cum_function, skipna=skipna) return result def sum(self, skipna=True, min_count=0, **kwargs): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 46ff4a0e2f612..530301fc3e38a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1690,6 +1690,28 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: result = type(values)._simple_new( # type: ignore[attr-defined] result, dtype=orig_dtype ) + from pandas.core.arrays import IntegerArray + + if isinstance(values, IntegerArray): + data = values._data + mask = values._mask + + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: data.min(), + np.cumsum: 0, + np.minimum.accumulate: data.max(), + }[accum_func] + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + vals = accum_func(values) + result = IntegerArray(vals, mask) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() From 55de384d0336a9265f897e466339c1463c0d2145 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 16 Jan 2021 08:21:13 +0100 Subject: [PATCH 44/89] delegate to EAs _accumulate function in block mgr --- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ce1e962614c58..82b527e4e701a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10429,7 +10429,10 @@ def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - result = nanops.na_accum_func(values, func, skipna=skipna) + if isinstance(values, ExtensionArray): + result = values._accumulate(name, skipna=skipna, **kwargs) + else: + result = nanops.na_accum_func(values, func, skipna=skipna) result = result.T if hasattr(result, "T") else result return result From 6a5b7f8dee1e28ec720411f879a759a35d66b058 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 19 Jan 2021 22:06:27 +0100 Subject: [PATCH 45/89] moves implementation from nanops to masked_accumulations --- .../core/array_algos/masked_accumulations.py | 69 +++++++++++++++++++ pandas/core/arrays/boolean.py | 7 -- pandas/core/arrays/integer.py | 15 ---- pandas/core/arrays/masked.py | 23 ++++++- pandas/core/nanops.py | 22 ------ 5 files changed, 91 insertions(+), 45 deletions(-) create mode 100644 pandas/core/array_algos/masked_accumulations.py diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py new file mode 100644 index 0000000000000..a737ddd990600 --- /dev/null +++ b/pandas/core/array_algos/masked_accumulations.py @@ -0,0 +1,69 @@ +from typing import Callable + +import numpy as np + +from pandas.core import nanops as no + +""" +masked_accumulations.py is for accumulation algorithms using a mask-based approach +for missing values. +""" + + +def _cum_func( + func: Callable, + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, + min_count: int = 0, +): + """ + Accumulations for 1D masked array. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: values.min(), + np.cumsum: 0, + np.minimum.accumulate: values.max(), + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + + values, mask, dtype, dtype_max, fill_value = no._get_values( + values, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + values = func(values) + return values, mask + + +def cumsum(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, mask, skipna=skipna) + + +def cumprod(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, mask, skipna=skipna) + + +def cummin(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna) + + +def cummax(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c624115036486..2bc908186f7f4 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -682,13 +682,6 @@ def _arith_method(self, other, op): return self._maybe_mask_result(result, mask, other, op_name) - def _accumulate(self, name: str, skipna: bool = True, **kwargs): - from pandas.arrays import IntegerArray - - return IntegerArray(self._data.astype("int8"), self._mask)._accumulate( - name, skipna, **kwargs - ) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if name in {"any", "all"}: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 815c84779d30f..f8378fb7d1500 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -470,21 +470,6 @@ def _cmp_method(self, other, op): return BooleanArray(result, mask) - def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": - cum_function = { - "cumprod": np.cumprod, - "cummax": np.maximum.accumulate, - "cumsum": np.cumsum, - "cummin": np.minimum.accumulate, - }.get(name) - if not cum_function: - raise ValueError(f"{name} is not defined for IntegerArrays") - - from pandas.core.nanops import na_accum_func - - result = na_accum_func(self, cum_function, skipna=skipna) - return result - def sum(self, *, skipna=True, min_count=0, **kwargs): nv.validate_sum((), kwargs) return super()._reduce("sum", skipna=skipna, min_count=min_count) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e4a98a54ee94c..f83186a05a9c1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,7 +22,7 @@ from pandas.core import nanops from pandas.core.algorithms import factorize_array, take -from pandas.core.array_algos import masked_reductions +from pandas.core.array_algos import masked_accumulations, masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.indexers import check_array_indexer @@ -413,3 +413,24 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return libmissing.NA return result + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArrayT: + data = self._data + mask = self._mask + + if name in {"cumsum", "cumprod", "cummin", "cummax"}: + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + + from pandas.core.arrays import BooleanArray, IntegerArray + + if isinstance(self, BooleanArray): + return IntegerArray(data, mask, copy=False) + + return type(self)(data, mask, copy=False) + + raise NotImplementedError( + "Accumlation {name} not implemented for BaseMaskedArray" + ) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e6e2f96f04ab9..fb9b20bd43d7c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1734,28 +1734,6 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: result = type(values)._simple_new( # type: ignore[attr-defined] result, dtype=orig_dtype ) - from pandas.core.arrays import IntegerArray - - if isinstance(values, IntegerArray): - data = values._data - mask = values._mask - - fill_value = { - np.cumprod: 1, - np.maximum.accumulate: data.min(), - np.cumsum: 0, - np.minimum.accumulate: data.max(), - }[accum_func] - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - vals = accum_func(values) - result = IntegerArray(vals, mask) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() From 9c63c64c07bd5a4effbe5dac1e787349144ade9a Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 21 Jan 2021 19:04:42 +0100 Subject: [PATCH 46/89] fixes typing annotations in base and masked --- pandas/core/arrays/base.py | 4 +++- pandas/core/arrays/masked.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3eec621768e75..2e5bb66ac3ffb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1203,7 +1203,9 @@ def _concat_same_type( # of objects _can_hold_na = True - def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": + def _accumulate( + self: ExtensionArray, name: str, *, skipna=True, **kwargs + ) -> ExtensionArray: """ Return an ExtensionArray performing an accumulation operation. The underlying data type might change diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f83186a05a9c1..fc5a4a4d13955 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -416,7 +416,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): def _accumulate( self, name: str, *, skipna: bool = True, **kwargs - ) -> BaseMaskedArrayT: + ) -> BaseMaskedArray: data = self._data mask = self._mask From 2f23499eaf0061d9b855586d6044b1e284059387 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 22 Jan 2021 07:49:22 +0100 Subject: [PATCH 47/89] fixes merge error --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2ea79ec863534..7fcc649bdb8ca 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -21,7 +21,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops -from pandas.core.algorithms import factorize_array, take +from pandas.core.algorithms import factorize_array, isin, take from pandas.core.array_algos import masked_accumulations, masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray From a5b30e635fc2dc999a466663ae38589483469224 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 22 Jan 2021 08:26:15 +0100 Subject: [PATCH 48/89] fills na values without nanops --- pandas/core/array_algos/masked_accumulations.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index a737ddd990600..83e42f4d47250 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -2,8 +2,6 @@ import numpy as np -from pandas.core import nanops as no - """ masked_accumulations.py is for accumulation algorithms using a mask-based approach for missing values. @@ -42,9 +40,7 @@ def _cum_func( except KeyError: raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") - values, mask, dtype, dtype_max, fill_value = no._get_values( - values, skipna=skipna, fill_value=fill_value, mask=mask - ) + values[mask] = fill_value if not skipna: mask = np.maximum.accumulate(mask) From d22c8a02dc4ac7edeb419b52b2d6fccb72996198 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 25 Jan 2021 21:21:52 +0100 Subject: [PATCH 49/89] fixes incorrect call to cumsum and changes to cumprod --- pandas/core/array_algos/masked_accumulations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 83e42f4d47250..d388ebc4b9b0d 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -54,7 +54,7 @@ def cumsum(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def cumprod(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): - return _cum_func(np.cumsum, values, mask, skipna=skipna) + return _cum_func(np.cumprod, values, mask, skipna=skipna) def cummin(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): From a5866c7ecb05faefd940da45931c97eae7e23924 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 25 Jan 2021 21:23:16 +0100 Subject: [PATCH 50/89] add _accumulate to boolean --- pandas/core/arrays/boolean.py | 9 +++++++++ pandas/core/arrays/masked.py | 5 ----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dd281a39907fd..0bdb255c340c8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -691,6 +691,15 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return super()._reduce(name, skipna=skipna, **kwargs) + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArray: + from pandas.core.arrays import IntegerArray + + data = self._data.astype(int) + mask = self._mask + return IntegerArray(data, mask)._accumulate(name, skipna=skipna, **kwargs) + def _maybe_mask_result(self, result, mask, other, op_name: str): """ Parameters diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7fcc649bdb8ca..2ba995b9b45ec 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -438,11 +438,6 @@ def _accumulate( op = getattr(masked_accumulations, name) data, mask = op(data, mask, skipna=skipna, **kwargs) - from pandas.core.arrays import BooleanArray, IntegerArray - - if isinstance(self, BooleanArray): - return IntegerArray(data, mask, copy=False) - return type(self)(data, mask, copy=False) raise NotImplementedError( From 8255457076b1c46828a46abda17651eabd937dd5 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 25 Jan 2021 21:23:48 +0100 Subject: [PATCH 51/89] makes tests a lot easier - cumprod tests still fail --- pandas/tests/extension/test_boolean.py | 5 +- pandas/tests/extension/test_integer.py | 85 +++++++++++++------------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e9993297919a8..55e5727c58d3d 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -406,7 +406,10 @@ class TestUnaryOps(base.BaseUnaryOpsTests): class TestNumericAccumulation(base.BaseNumericAccumulateTests): - pass + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) + tm.assert_series_equal(result, expected, check_dtype=False) # TODO parsing not yet supported diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index cfb070e391019..13b80c1eea754 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -250,47 +250,50 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): class TestNumericAccumulation(base.BaseNumericAccumulateTests): def check_accumulate(self, s, op_name, skipna): - # overwrite to ensure pd.NA is tested instead of np.nan - # https://github.com/pandas-dev/pandas/issues/30958 - if op_name == "cumsum": - if s.dtype.name.startswith("U"): - expected_dtype = "uint64" - else: - expected_dtype = "int64" - result = getattr(s, op_name)(skipna=skipna) - expected = pd.Series( - integer_array( - getattr(s.astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, - ) - ) - tm.assert_series_equal(result, expected) - elif op_name in ["cummax", "cummin"]: - expected_dtype = s.dtype - result = getattr(s, op_name)(skipna=skipna) - expected = pd.Series( - integer_array( - getattr(s.astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, - ) - ) - tm.assert_series_equal(result, expected) - elif op_name == "cumprod": - if s.dtype.name.startswith("U"): - expected_dtype = "uint64" - else: - expected_dtype = "int64" - result = getattr(s[:20], op_name)(skipna=skipna) - expected = pd.Series( - integer_array( - getattr(s[:20].astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, - ) - ) - tm.assert_series_equal(result, expected) - - else: - raise + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) + tm.assert_series_equal(result, expected, check_dtype=False) + # # overwrite to ensure pd.NA is tested instead of np.nan + # # https://github.com/pandas-dev/pandas/issues/30958 + # if op_name == "cumsum": + # if s.dtype.name.startswith("U"): + # expected_dtype = "uint64" + # else: + # expected_dtype = "int64" + # result = getattr(s, op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s.astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + # elif op_name in ["cummax", "cummin"]: + # expected_dtype = s.dtype + # result = getattr(s, op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s.astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + # elif op_name == "cumprod": + # if s.dtype.name.startswith("U"): + # expected_dtype = "uint64" + # else: + # expected_dtype = "int64" + # result = getattr(s[:20], op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s[:20].astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + + # else: + # raise class TestPrinting(base.BasePrintingTests): From 483b6088ab453a9072cbac720701996eb9720939 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 26 Jan 2021 20:41:26 +0100 Subject: [PATCH 52/89] adds BaseNumericAccumulation for floating masked array --- pandas/tests/extension/test_floating.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 440d7391c558f..8889bf896727d 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -222,3 +222,7 @@ class TestPrinting(base.BasePrintingTests): class TestParsing(base.BaseParsingTests): pass + + +class TestNumericAccumulation(base.BaseNumericAccumulateTests): + pass From 150fd3b63130fd669615644def565beff66f321d Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 26 Jan 2021 21:05:43 +0100 Subject: [PATCH 53/89] tests no numeric accumulations according to _accumulate interface --- pandas/tests/extension/test_sparse.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 86f9080571459..3d331d8652397 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -444,3 +444,7 @@ def test_EA_types(self, engine, data): expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) + + +class TestNoNumericAccumulations(base.BaseNoAccumulateTests): + pass From 80e2dc62045f22132ff86c74da174dcb2a827e9c Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 28 Jan 2021 19:04:20 +0100 Subject: [PATCH 54/89] uses NotImplementedError in base accumulate function --- pandas/core/arrays/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 30ca10376a0c5..19774f24253d2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1261,9 +1261,9 @@ def _accumulate( Raises ------ - TypeError : subclass does not define accumulations + NotImplementedError : subclass does not define accumulations """ - raise TypeError(f"cannot perform {name} with type {self.dtype}") + raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ From dceab99d70485630d67fff2c6c35ee2cac9918a3 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 20:50:39 +0100 Subject: [PATCH 55/89] ensures the fill values are data independent additionally, remove min_count as irrellevant --- pandas/core/array_algos/masked_accumulations.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index d388ebc4b9b0d..df26b4f6a958a 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -2,6 +2,8 @@ import numpy as np +from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype + """ masked_accumulations.py is for accumulation algorithms using a mask-based approach for missing values. @@ -14,7 +16,6 @@ def _cum_func( mask: np.ndarray, *, skipna: bool = True, - min_count: int = 0, ): """ Accumulations for 1D masked array. @@ -30,12 +31,21 @@ def _cum_func( skipna : bool, default True Whether to skip NA. """ + dtype_info = None + if is_float_dtype(values): + dtype_info = np.finfo(values.dtype.type) + elif is_integer_dtype(values): + dtype_info = np.iinfo(values.dtype.type) + else: + raise NotImplementedError( + f"No masked accumulation defined for dtype {values.dtype.type}" + ) try: fill_value = { np.cumprod: 1, - np.maximum.accumulate: values.min(), + np.maximum.accumulate: dtype_info.min, np.cumsum: 0, - np.minimum.accumulate: values.max(), + np.minimum.accumulate: dtype_info.max, }[func] except KeyError: raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") From 1c14f18bd45ee7eaf61688a14aca7c4ac7e2ff8f Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 20:54:17 +0100 Subject: [PATCH 56/89] adds accumulation for datetimelikes in generic.py ensure that datetimelikes are wrapped create a twin of masked_accumulations for datetimelikes timedeltas also allow cumsum and cumprod, theoretically --- pandas/core/arrays/datetimelike.py | 67 ++++++++++++++++++++++++++++++ pandas/core/arrays/timedeltas.py | 19 +++++++++ pandas/core/generic.py | 4 ++ 3 files changed, 90 insertions(+) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b2629e606f8f5..320e8783b58d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -64,6 +64,7 @@ from pandas.core import nanops, ops from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts +from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray, ravel_compat import pandas.core.common as com @@ -1187,6 +1188,72 @@ def _time_shift(self, periods, freq=None): # to be passed explicitly. return self._generate_range(start=start, end=end, periods=None, freq=self.freq) + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> DatetimeLikeArrayT: + + data = self._data.copy() + + if name in {"cummin", "cummax"}: + op = getattr(datetimelike_accumulations, name) + data = op(data, skipna=skipna, **kwargs) + + return type(self)._simple_new(data, freq=self.freq, dtype=self.dtype) + + raise NotImplementedError( + f"Accumlation {name} not implemented for {type(self)}" + ) + + # func_map = {'cumprod' : np.cumprod, 'cummax':np.maximum.accumulate, + # 'cumsum': np.cumsum, 'cummim':np.minimum.accumulate } + # accum_func = func_map[name] + + # freq = self._freq + + # mask_a, mask_b = { + # np.cumprod: (1.0, np.nan), + # np.maximum.accumulate: (-np.inf, np.nan), + # np.cumsum: (0.0, np.nan), + # np.minimum.accumulate: (np.inf, np.nan), + # }[accum_func] + + # values = self._data + # # GH#30460, GH#29058 + # # numpy 1.18 started sorting NaTs at the end instead of beginning, + # # so we need to work around to maintain backwards-consistency. + # #orig_dtype = values.dtype + + # # We need to define mask before masking NaTs + # mask = isna(values) + + # if accum_func == np.minimum.accumulate: + # # Note: the accum_func comparison fails as an "is" comparison + # y = values.view("i8") + # y[mask] = np.iinfo(np.int64).max + # changed = True + # else: + # y = values + # changed = False + + # result = accum_func(y.view("i8"), axis=0) + # if skipna: + # result[mask] = iNaT + # elif accum_func == np.minimum.accumulate: + # # Restore NaTs that we masked previously + # nz = (~np.asarray(mask)).nonzero()[0] + # if len(nz): + # # everything up to the first non-na entry stays NaT + # result[: nz[0]] = iNaT + + # if changed: + # # restore NaT elements + # y[mask] = iNaT # TODO: could try/finally for this? + + # # DatetimeArray + # result = type(self)._simple_new( # type: ignore[attr-defined] + # result, + # ) + @unpack_zerodim_and_defer("__add__") def __add__(self, other): other_dtype = getattr(other, "dtype", None) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index e9160c92435a4..ee72829b575d9 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -47,6 +47,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr +from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arrays import IntegerArray, datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com @@ -390,6 +391,24 @@ def std( return self._box_func(result) return self._from_backing_data(result) + # ---------------------------------------------------------------- + # Accumulations + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> TimedeltaArray: + + data = self._data.copy() + + if name in {"cumsum", "cumsum"}: + op = getattr(datetimelike_accumulations, name) + data = op(data, skipna=skipna, **kwargs) + + return type(self)._simple_new(data, freq=None, dtype=self.dtype) + + else: + return super()._accumulate(name, skipna=skipna, **kwargs) + # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e9549eead0a42..735427da10373 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10305,6 +10305,10 @@ def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values + from pandas.core.construction import ensure_wrapped_if_datetimelike + + values = ensure_wrapped_if_datetimelike(values) + if isinstance(values, ExtensionArray): result = values._accumulate(name, skipna=skipna, **kwargs) else: From 597e9780f3542478789d0dab1d1b2275a50b1b71 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 21:14:03 +0100 Subject: [PATCH 57/89] actually ads datetimelike accumulation algos --- .../array_algos/datetimelike_accumulations.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 pandas/core/array_algos/datetimelike_accumulations.py diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py new file mode 100644 index 0000000000000..f2e0bb0ba3ff6 --- /dev/null +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -0,0 +1,69 @@ +from typing import Callable + +import numpy as np + +from pandas._libs import iNaT + +from pandas.core.dtypes.missing import isna + +""" +datetimelke_accumulations.py is for accumulations of datetimelike extension arrays +""" + + +def _cum_func( + func: Callable, + values: np.ndarray, + *, + skipna: bool = True, +): + """ + Accumulations for 1D datetimelike arrays. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + skipna : bool, default True + Whether to skip NA. + """ + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: np.iinfo(np.int64).min, + np.cumsum: 0, + np.minimum.accumulate: np.iinfo(np.int64).max, + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + + mask = isna(values) + y = values.view("i8") + y[mask] = fill_value + + if not skipna: + # This is different compared to the recent implementation for datetimelikes + # but is the same as the implementation for masked arrays + mask = np.maximum.accumulate(mask) + + result = func(y) + result[mask] = iNaT + return result + + +def cumsum(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, skipna=skipna) + + +def cumprod(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumprod, values, skipna=skipna) + + +def cummin(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, skipna=skipna) + + +def cummax(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, skipna=skipna) From 5ebe8eab1cf1dfa5f9b62e1b63e389b1b65af6f4 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 21:42:51 +0100 Subject: [PATCH 58/89] fixes absolute imports --- pandas/core/array_algos/masked_accumulations.py | 5 ++++- pandas/tests/extension/base/accumulate.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index df26b4f6a958a..fee283227df57 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -2,7 +2,10 @@ import numpy as np -from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype +from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, +) """ masked_accumulations.py is for accumulation algorithms using a mask-based approach diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 3670e89f12ad8..632198b47099d 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,8 +1,7 @@ import pytest import pandas as pd - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseAccumulateTests(BaseExtensionTests): From 32367c07f72b37385492ce0794face39a21d39b1 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 20 Feb 2021 08:19:48 +0100 Subject: [PATCH 59/89] changes error to catch to adhere to changed implementation --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 632198b47099d..882f96572791c 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -24,7 +24,7 @@ def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna op_name = all_numeric_accumulations s = pd.Series(data) - with pytest.raises(TypeError): + with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna) From 628611cf06b9c32dee5fa156022754e68f577c50 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 16:01:50 +0200 Subject: [PATCH 60/89] Remove blank line in old whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fd5b5f28fccd5..e1f54c439ae9b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -342,7 +342,6 @@ Other enhancements - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for ``yerr`` and/or ``xerr``, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) - .. --------------------------------------------------------------------------- .. _whatsnew_110.notable_bug_fixes: From d8848453b09ab1aa4a3b45992b9c6f7d98c2023e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 16:10:20 +0200 Subject: [PATCH 61/89] Remove merge error --- pandas/core/arrays/boolean.py | 133 ---------------------------------- 1 file changed, 133 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index ad02beb397c77..33efea27a0c5e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -5,7 +5,6 @@ TYPE_CHECKING, cast, ) -import warnings import numpy as np @@ -20,14 +19,10 @@ ) from pandas.core.dtypes.common import ( - is_bool_dtype, - is_float_dtype, - is_integer_dtype, is_list_like, is_numeric_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.inference import is_float from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -384,104 +379,6 @@ def _logical_method(self, other, op): # i.e. BooleanArray return self._maybe_mask_result(result, mask) - def _cmp_method(self, other, op): - from pandas.arrays import ( - FloatingArray, - IntegerArray, - ) - - if isinstance(other, (IntegerArray, FloatingArray)): - return NotImplemented - - mask = None - - if isinstance(other, BooleanArray): - other, mask = other._data, other._mask - - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - if len(self) != len(other): - raise ValueError("Lengths must match to compare") - - if other is libmissing.NA: - # numpy does not handle pd.NA well as "other" scalar (it returns - # a scalar False instead of an array) - result = np.zeros_like(self._data) - mask = np.ones_like(self._data) - else: - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - result = op(self._data, other) - - # nans propagate - if mask is None: - mask = self._mask.copy() - else: - mask = self._mask | mask - - return BooleanArray(result, mask, copy=False) - - def _arith_method(self, other, op): - mask = None - op_name = op.__name__ - - if isinstance(other, BooleanArray): - other, mask = other._data, other._mask - - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - if len(self) != len(other): - raise ValueError("Lengths must match") - - # nans propagate - if mask is None: - mask = self._mask - if other is libmissing.NA: - mask |= True - else: - mask = self._mask | mask - - if other is libmissing.NA: - # if other is NA, the result will be all NA and we can't run the - # actual op, so we need to choose the resulting dtype manually - if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}: - dtype = "int8" - else: - dtype = "bool" - result = np.zeros(len(self._data), dtype=dtype) - else: - if op_name in {"pow", "rpow"} and isinstance(other, np.bool_): - # Avoid DeprecationWarning: In future, it will be an error - # for 'np.bool_' scalars to be interpreted as an index - other = bool(other) - - with np.errstate(all="ignore"): - result = op(self._data, other) - - # divmod returns a tuple - if op_name == "divmod": - div, mod = result - return ( - self._maybe_mask_result(div, mask, other, "floordiv"), - self._maybe_mask_result(mod, mask, other, "mod"), - ) - - return self._maybe_mask_result(result, mask, other, op_name) - - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - - if name in {"any", "all"}: - return getattr(self, name)(skipna=skipna, **kwargs) - - return super()._reduce(name, skipna=skipna, **kwargs) - def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: @@ -490,33 +387,3 @@ def _accumulate( data = self._data.astype(int) mask = self._mask return IntegerArray(data, mask)._accumulate(name, skipna=skipna, **kwargs) - - def _maybe_mask_result(self, result, mask, other, op_name: str): - """ - Parameters - ---------- - result : array-like - mask : array-like bool - other : scalar or array-like - op_name : str - """ - # if we have a float operand we are by-definition - # a float result - # or our op is a divide - if (is_float_dtype(other) or is_float(other)) or ( - op_name in ["rtruediv", "truediv"] - ): - from pandas.core.arrays import FloatingArray - - return FloatingArray(result, mask, copy=False) - - elif is_bool_dtype(result): - return BooleanArray(result, mask, copy=False) - - elif is_integer_dtype(result): - from pandas.core.arrays import IntegerArray - - return IntegerArray(result, mask, copy=False) - else: - result[mask] = np.nan - return result From 054ad9436aca6a57615ccd96f59631d005d813a3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 16:13:46 +0200 Subject: [PATCH 62/89] Fix additional merge errors --- pandas/core/arrays/timedeltas.py | 2 +- pandas/tests/extension/test_boolean.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index daefbffc42731..e18643c27cbf3 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -372,7 +372,7 @@ def _accumulate( data = self._data.copy() - if name in {"cumsum", "cumsum"}: + if name in {"cumsum", "cumprod"}: op = getattr(datetimelike_accumulations, name) data = op(data, skipna=skipna, **kwargs) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 03a02548b26ea..1c00ddff4b232 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -404,9 +404,6 @@ def check_accumulate(self, s, op_name, skipna): tm.assert_series_equal(result, expected, check_dtype=False) -# TODO parsing not yet supported -# class TestParsing(base.BaseParsingTests): -# pass class TestParsing(base.BaseParsingTests): pass From 64219d9a1ec787d0c023fa0af2dbe321428ed995 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 17:01:30 +0200 Subject: [PATCH 63/89] Refactor datetimelike accum funcs --- .../array_algos/datetimelike_accumulations.py | 69 ------------------- pandas/core/arrays/datetimelike.py | 5 +- pandas/core/arrays/timedeltas.py | 5 +- 3 files changed, 4 insertions(+), 75 deletions(-) delete mode 100644 pandas/core/array_algos/datetimelike_accumulations.py diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py deleted file mode 100644 index f2e0bb0ba3ff6..0000000000000 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Callable - -import numpy as np - -from pandas._libs import iNaT - -from pandas.core.dtypes.missing import isna - -""" -datetimelke_accumulations.py is for accumulations of datetimelike extension arrays -""" - - -def _cum_func( - func: Callable, - values: np.ndarray, - *, - skipna: bool = True, -): - """ - Accumulations for 1D datetimelike arrays. - - Parameters - ---------- - func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate - values : np.ndarray - Numpy array with the values (can be of any dtype that support the - operation). - skipna : bool, default True - Whether to skip NA. - """ - try: - fill_value = { - np.cumprod: 1, - np.maximum.accumulate: np.iinfo(np.int64).min, - np.cumsum: 0, - np.minimum.accumulate: np.iinfo(np.int64).max, - }[func] - except KeyError: - raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") - - mask = isna(values) - y = values.view("i8") - y[mask] = fill_value - - if not skipna: - # This is different compared to the recent implementation for datetimelikes - # but is the same as the implementation for masked arrays - mask = np.maximum.accumulate(mask) - - result = func(y) - result[mask] = iNaT - return result - - -def cumsum(values: np.ndarray, *, skipna: bool = True): - return _cum_func(np.cumsum, values, skipna=skipna) - - -def cumprod(values: np.ndarray, *, skipna: bool = True): - return _cum_func(np.cumprod, values, skipna=skipna) - - -def cummin(values: np.ndarray, *, skipna: bool = True): - return _cum_func(np.minimum.accumulate, values, skipna=skipna) - - -def cummax(values: np.ndarray, *, skipna: bool = True): - return _cum_func(np.maximum.accumulate, values, skipna=skipna) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 32bf41cc91532..6416d765b9b7a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -115,7 +115,6 @@ mode, unique1d, ) -from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import ( NDArrayBackedExtensionArray, @@ -1433,8 +1432,8 @@ def _accumulate( data = self._data.copy() if name in {"cummin", "cummax"}: - op = getattr(datetimelike_accumulations, name) - data = op(data, skipna=skipna, **kwargs) + func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate + data = nanops.na_accum_func(data, func, skipna=skipna) return type(self)._simple_new(data, freq=self.freq, dtype=self.dtype) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index e18643c27cbf3..ddce78c8e4b5f 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -53,7 +53,6 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops -from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com @@ -373,8 +372,8 @@ def _accumulate( data = self._data.copy() if name in {"cumsum", "cumprod"}: - op = getattr(datetimelike_accumulations, name) - data = op(data, skipna=skipna, **kwargs) + func = np.cumsum if name == "cumsum" else np.cumprod + data = nanops.na_accum_func(data, func, skipna=skipna) return type(self)._simple_new(data, freq=None, dtype=self.dtype) From a8645db992a93ac67d1dcee656d56207b915b311 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 17:12:45 +0200 Subject: [PATCH 64/89] Remove unnecessary import --- pandas/core/array_algos/masked_accumulations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index fee283227df57..43b2004867f43 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Callable import numpy as np @@ -34,7 +36,6 @@ def _cum_func( skipna : bool, default True Whether to skip NA. """ - dtype_info = None if is_float_dtype(values): dtype_info = np.finfo(values.dtype.type) elif is_integer_dtype(values): From 597dd84c13f3fa107f91ce65df5f3a69777c54d4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 17:25:43 +0200 Subject: [PATCH 65/89] Refactor tests --- pandas/tests/extension/test_integer.py | 84 ++++++++++++-------------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index e7df3199e5cbe..b4c7598c17b2f 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -226,50 +226,46 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): class TestNumericAccumulation(base.BaseNumericAccumulateTests): def check_accumulate(self, s, op_name, skipna): - result = getattr(s, op_name)(skipna=skipna) - expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) - tm.assert_series_equal(result, expected, check_dtype=False) - # # overwrite to ensure pd.NA is tested instead of np.nan - # # https://github.com/pandas-dev/pandas/issues/30958 - # if op_name == "cumsum": - # if s.dtype.name.startswith("U"): - # expected_dtype = "uint64" - # else: - # expected_dtype = "int64" - # result = getattr(s, op_name)(skipna=skipna) - # expected = pd.Series( - # integer_array( - # getattr(s.astype("float64"), op_name)(skipna=skipna), - # dtype=expected_dtype, - # ) - # ) - # tm.assert_series_equal(result, expected) - # elif op_name in ["cummax", "cummin"]: - # expected_dtype = s.dtype - # result = getattr(s, op_name)(skipna=skipna) - # expected = pd.Series( - # integer_array( - # getattr(s.astype("float64"), op_name)(skipna=skipna), - # dtype=expected_dtype, - # ) - # ) - # tm.assert_series_equal(result, expected) - # elif op_name == "cumprod": - # if s.dtype.name.startswith("U"): - # expected_dtype = "uint64" - # else: - # expected_dtype = "int64" - # result = getattr(s[:20], op_name)(skipna=skipna) - # expected = pd.Series( - # integer_array( - # getattr(s[:20].astype("float64"), op_name)(skipna=skipna), - # dtype=expected_dtype, - # ) - # ) - # tm.assert_series_equal(result, expected) - - # else: - # raise + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + if op_name == "cumsum": + if s.dtype.name.startswith("U"): + expected_dtype = "UInt64" + else: + expected_dtype = "Int64" + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + pd.array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name in ["cummax", "cummin"]: + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + pd.array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=s.dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name == "cumprod": + if s.dtype.name.startswith("U"): + expected_dtype = "UInt64" + else: + expected_dtype = "Int64" + result = getattr(s[:20], op_name)(skipna=skipna) + expected = pd.Series( + pd.array( + getattr(s[:20].astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + + else: + raise class TestPrinting(base.BasePrintingTests): From 13b263375ffaf891ebd6ca9634a0b3b8c56f184d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 22:03:41 +0200 Subject: [PATCH 66/89] Skip test --- pandas/tests/extension/base/accumulate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 882f96572791c..b3b7751fadf0a 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -12,12 +12,16 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) + print(s.tolist()) + if result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna: + pytest.skip("Float32 precision lead to large differences") + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) self.assert_series_equal(result, expected, check_dtype=False) class BaseNoAccumulateTests(BaseAccumulateTests): - """ we don't define any accumulations """ + """we don't define any accumulations""" @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): From 2acc7a8cd340500ada9ab9bb75d36dac74d9102e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 09:41:00 +0200 Subject: [PATCH 67/89] Fix mypy --- pandas/core/array_algos/masked_accumulations.py | 1 + pandas/core/arrays/base.py | 4 +--- pandas/core/arrays/datetimelike.py | 12 +++++++----- pandas/core/arrays/timedeltas.py | 8 +++----- pandas/core/generic.py | 1 + 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 43b2004867f43..375b92fcd9e26 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -36,6 +36,7 @@ def _cum_func( skipna : bool, default True Whether to skip NA. """ + dtype_info: np.iinfo | np.finfo if is_float_dtype(values): dtype_info = np.finfo(values.dtype.type) elif is_integer_dtype(values): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c070481013ce2..99b95f6dc07ae 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1410,9 +1410,7 @@ def _concat_same_type( def _can_hold_na(self) -> bool: return self.dtype._can_hold_na - def _accumulate( - self: ExtensionArray, name: str, *, skipna=True, **kwargs - ) -> ExtensionArray: + def _accumulate(self, name: str, *, skipna=True, **kwargs) -> ExtensionArray: """ Return an ExtensionArray performing an accumulation operation. The underlying data type might change diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6416d765b9b7a..6d0e065ce64ad 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1425,17 +1425,19 @@ def _time_shift( # to be passed explicitly. return self._generate_range(start=start, end=end, periods=None, freq=self.freq) - def _accumulate( - self, name: str, *, skipna: bool = True, **kwargs - ) -> DatetimeLikeArrayT: + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): data = self._data.copy() if name in {"cummin", "cummax"}: func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate - data = nanops.na_accum_func(data, func, skipna=skipna) + result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) - return type(self)._simple_new(data, freq=self.freq, dtype=self.dtype) + # error: Unexpected keyword argument "freq" for + # "_simple_new" of "NDArrayBacked" [call-arg] + return type(self)._simple_new( + result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] + ) raise NotImplementedError( f"Accumlation {name} not implemented for {type(self)}" diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index ddce78c8e4b5f..438533c79ac16 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -365,17 +365,15 @@ def std( # ---------------------------------------------------------------- # Accumulations - def _accumulate( - self, name: str, *, skipna: bool = True, **kwargs - ) -> TimedeltaArray: + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): data = self._data.copy() if name in {"cumsum", "cumprod"}: func = np.cumsum if name == "cumsum" else np.cumprod - data = nanops.na_accum_func(data, func, skipna=skipna) + result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) - return type(self)._simple_new(data, freq=None, dtype=self.dtype) + return type(self)._simple_new(result, freq=None, dtype=self.dtype) else: return super()._accumulate(name, skipna=skipna, **kwargs) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0a38ca344d51d..2e4a86137fafa 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11129,6 +11129,7 @@ def block_accum_func(blk_values): values = ensure_wrapped_if_datetimelike(values) + result: np.ndarray | ExtensionArray if isinstance(values, ExtensionArray): result = values._accumulate(name, skipna=skipna, **kwargs) else: From a410a88e8ba2b6898923c5388336db2483b66b4d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 15:44:49 +0200 Subject: [PATCH 68/89] Fix dtype creation --- pandas/core/arrays/base.py | 6 +++--- pandas/tests/extension/test_integer.py | 23 +++++++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 99b95f6dc07ae..05440f8006951 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1411,9 +1411,9 @@ def _can_hold_na(self) -> bool: return self.dtype._can_hold_na def _accumulate(self, name: str, *, skipna=True, **kwargs) -> ExtensionArray: - """ - Return an ExtensionArray performing an accumulation operation. - The underlying data type might change + """Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. Parameters ---------- diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index b4c7598c17b2f..d1977d23c3c6a 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -16,6 +16,11 @@ import numpy as np import pytest +from pandas.compat import ( + IS64, + is_platform_windows, +) + import pandas as pd import pandas._testing as tm from pandas.api.types import ( @@ -228,11 +233,17 @@ class TestNumericAccumulation(base.BaseNumericAccumulateTests): def check_accumulate(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 + length = 64 + if not IS64 or is_platform_windows(): + if not s.dtype.itemsize == 8: + length = 32 + + if s.dtype.name.startswith("U"): + expected_dtype = f"UInt{length}" + else: + expected_dtype = f"Int{length}" + if op_name == "cumsum": - if s.dtype.name.startswith("U"): - expected_dtype = "UInt64" - else: - expected_dtype = "Int64" result = getattr(s, op_name)(skipna=skipna) expected = pd.Series( pd.array( @@ -251,10 +262,6 @@ def check_accumulate(self, s, op_name, skipna): ) tm.assert_series_equal(result, expected) elif op_name == "cumprod": - if s.dtype.name.startswith("U"): - expected_dtype = "UInt64" - else: - expected_dtype = "Int64" result = getattr(s[:20], op_name)(skipna=skipna) expected = pd.Series( pd.array( From a0665880adfb246cceb3f66a6ec3c0d066ac5227 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 22:15:44 +0200 Subject: [PATCH 69/89] Fix cumprod tests --- pandas/tests/extension/test_integer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d1977d23c3c6a..eeb92b3fc78c0 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -262,10 +262,10 @@ def check_accumulate(self, s, op_name, skipna): ) tm.assert_series_equal(result, expected) elif op_name == "cumprod": - result = getattr(s[:20], op_name)(skipna=skipna) + result = getattr(s[:12], op_name)(skipna=skipna) expected = pd.Series( pd.array( - getattr(s[:20].astype("float64"), op_name)(skipna=skipna), + getattr(s[:12].astype("float64"), op_name)(skipna=skipna), dtype=expected_dtype, ) ) From 580267bd41ccc98462b3d59dd50a639997db3536 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 18 Aug 2022 09:03:59 +0200 Subject: [PATCH 70/89] Fix docstring --- pandas/core/arrays/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 05440f8006951..bd90435c59e2e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1411,7 +1411,8 @@ def _can_hold_na(self) -> bool: return self.dtype._can_hold_na def _accumulate(self, name: str, *, skipna=True, **kwargs) -> ExtensionArray: - """Return an ExtensionArray performing an accumulation operation. + """ + Return an ExtensionArray performing an accumulation operation. The underlying data type might change. From 54aa8a877d218cd04466efe23cdc7e81ccbd48ee Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 18 Aug 2022 23:23:27 +0200 Subject: [PATCH 71/89] Adress review --- pandas/core/array_algos/masked_accumulations.py | 4 +++- pandas/core/arrays/masked.py | 11 +++-------- pandas/tests/extension/base/accumulate.py | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 375b92fcd9e26..4583d950702f8 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -53,7 +53,9 @@ def _cum_func( np.minimum.accumulate: dtype_info.max, }[func] except KeyError: - raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + raise NotImplementedError( + f"No accumulation for {func} implemented on BaseMaskedArray" + ) values[mask] = fill_value diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index a4c490302293c..df6944b30141d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1266,12 +1266,7 @@ def _accumulate( data = self._data mask = self._mask - if name in {"cumsum", "cumprod", "cummin", "cummax"}: - op = getattr(masked_accumulations, name) - data, mask = op(data, mask, skipna=skipna, **kwargs) + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) - return type(self)(data, mask, copy=False) - - raise NotImplementedError( - "Accumlation {name} not implemented for BaseMaskedArray" - ) + return type(self)(data, mask, copy=False) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index b3b7751fadf0a..8532ca7e6c529 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -12,7 +12,7 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) - print(s.tolist()) + if result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna: pytest.skip("Float32 precision lead to large differences") From 9c81a1d4029a729c0e20bf50e004af0bb9b1036c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Nov 2022 15:41:10 +0100 Subject: [PATCH 72/89] Adress review --- pandas/core/array_algos/masked_accumulations.py | 3 +++ pandas/core/arrays/boolean.py | 16 ++++++++++++---- pandas/tests/extension/base/accumulate.py | 5 ++++- pandas/tests/extension/test_boolean.py | 4 ++++ 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 4583d950702f8..469f4e1c9f32e 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -5,6 +5,7 @@ import numpy as np from pandas.core.dtypes.common import ( + is_bool_dtype, is_float_dtype, is_integer_dtype, ) @@ -41,6 +42,8 @@ def _cum_func( dtype_info = np.finfo(values.dtype.type) elif is_integer_dtype(values): dtype_info = np.iinfo(values.dtype.type) + elif is_bool_dtype(values): + dtype_info = np.iinfo(np.uint8) else: raise NotImplementedError( f"No masked accumulation defined for dtype {values.dtype.type}" diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 33efea27a0c5e..8ac665b1b2e11 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops +from pandas.core.array_algos import masked_accumulations from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, @@ -382,8 +383,15 @@ def _logical_method(self, other, op): def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: - from pandas.core.arrays import IntegerArray - - data = self._data.astype(int) + data = self._data mask = self._mask - return IntegerArray(data, mask)._accumulate(name, skipna=skipna, **kwargs) + if name in ("cummin", "cummax"): + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + return type(self)(data, mask, copy=False) + else: + from pandas.core.arrays import IntegerArray + + return IntegerArray(data.astype(int), mask)._accumulate( + name, skipna=skipna, **kwargs + ) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 8532ca7e6c529..e6f476ff0ac4f 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -14,7 +14,10 @@ def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) if result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna: - pytest.skip("Float32 precision lead to large differences") + pytest.skip( + f"Float32 precision lead to large differences with op {op_name} " + f"and skipna={skipna}" + ) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) self.assert_series_equal(result, expected, check_dtype=False) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index c0039148b9d2f..a347aea4bf28b 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -16,6 +16,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_bool_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype @@ -394,6 +396,8 @@ def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) tm.assert_series_equal(result, expected, check_dtype=False) + if op_name in ("cummin", "cummax"): + assert is_bool_dtype(result) class TestParsing(base.BaseParsingTests): From 6e2b4538b1f16f03cbb3dbab5fc29a8aad9d948e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 22 Nov 2022 21:20:57 +0000 Subject: [PATCH 73/89] Update pandas/core/arrays/base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ef514eaddbe7d..f1f688bc0b59f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1370,7 +1370,7 @@ def _concat_same_type( def _can_hold_na(self) -> bool: return self.dtype._can_hold_na - def _accumulate(self, name: str, *, skipna=True, **kwargs) -> ExtensionArray: + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> ExtensionArray: """ Return an ExtensionArray performing an accumulation operation. From cdca590ae67c2de9f9e2bcb63e11de6e559aa838 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 22 Nov 2022 21:21:08 +0000 Subject: [PATCH 74/89] Update pandas/tests/extension/test_integer.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/extension/test_integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index eeb92b3fc78c0..40254c79f96bf 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -272,7 +272,7 @@ def check_accumulate(self, s, op_name, skipna): tm.assert_series_equal(result, expected) else: - raise + raise NotImplementedError(f"{op_name} not supported") class TestPrinting(base.BasePrintingTests): From 6765fe12b0c9213d10806a73c12d072eccb1592f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 22 Nov 2022 21:47:44 +0000 Subject: [PATCH 75/89] Add comment --- pandas/core/array_algos/masked_accumulations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 469f4e1c9f32e..f1b412e898690 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -43,6 +43,7 @@ def _cum_func( elif is_integer_dtype(values): dtype_info = np.iinfo(values.dtype.type) elif is_bool_dtype(values): + # Max value has to be greater than 0 and min value should be zero dtype_info = np.iinfo(np.uint8) else: raise NotImplementedError( From d3be9f3a8131a49f1b9bc749ec1d304924f8c334 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Nov 2022 00:22:00 +0000 Subject: [PATCH 76/89] Clarify comment --- pandas/core/array_algos/masked_accumulations.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index f1b412e898690..581801b3c3946 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -43,7 +43,9 @@ def _cum_func( elif is_integer_dtype(values): dtype_info = np.iinfo(values.dtype.type) elif is_bool_dtype(values): - # Max value has to be greater than 0 and min value should be zero + # Max value of bool is 1, but since we are setting into a boolean + # array, 255 is fine as well. Min value has to be 0 when setting + # into the boolean array. dtype_info = np.iinfo(np.uint8) else: raise NotImplementedError( From b4eb0fd7cc49d222dbcaadc163d8ef1392e143ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Nov 2022 00:23:03 +0000 Subject: [PATCH 77/89] Fix pre commit --- pandas/core/arrays/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cd39a09706b7f..5144083ea6772 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1370,7 +1370,9 @@ def _concat_same_type( def _can_hold_na(self) -> bool: return self.dtype._can_hold_na - def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> ExtensionArray: + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ExtensionArray: """ Return an ExtensionArray performing an accumulation operation. From 611b85ebf6a174d6e803528ede667a38f57dd3fd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Nov 2022 00:38:37 +0000 Subject: [PATCH 78/89] Add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 02ea290995c8d..0e130d1d1e28a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -58,6 +58,7 @@ Other enhancements - Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`) - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) +- Added ``cumsum``, ``cumprod``, ``cummin`` and ``cummax`` to the ``ExtensionArray`` interface via ``_accumulate`` (:issue:`28385`) - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`) - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`) - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`) From a6a974a811fc4373d6e13105d73bd91fe147648e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 29 Nov 2022 21:37:36 +0100 Subject: [PATCH 79/89] Move to top of file --- pandas/core/array_algos/masked_accumulations.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 581801b3c3946..101397035e4ac 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -1,3 +1,8 @@ +""" +masked_accumulations.py is for accumulation algorithms using a mask-based approach +for missing values. +""" + from __future__ import annotations from typing import Callable @@ -10,11 +15,6 @@ is_integer_dtype, ) -""" -masked_accumulations.py is for accumulation algorithms using a mask-based approach -for missing values. -""" - def _cum_func( func: Callable, From e7364bd1b78885c7abf5807c4fab89a03b95793d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 29 Nov 2022 21:40:26 +0100 Subject: [PATCH 80/89] Change error --- pandas/core/array_algos/masked_accumulations.py | 14 +++++++++----- pandas/core/arrays/datetimelike.py | 4 +--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 101397035e4ac..07113128e0947 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -9,6 +9,8 @@ import numpy as np +from pandas._typing import npt + from pandas.core.dtypes.common import ( is_bool_dtype, is_float_dtype, @@ -19,13 +21,15 @@ def _cum_func( func: Callable, values: np.ndarray, - mask: np.ndarray, + mask: npt.NDArray[np.bool_], *, skipna: bool = True, ): """ Accumulations for 1D masked array. + We will modify values in place to replace NAs with the appropriate fill value. + Parameters ---------- func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate @@ -72,17 +76,17 @@ def _cum_func( return values, mask -def cumsum(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): return _cum_func(np.cumsum, values, mask, skipna=skipna) -def cumprod(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): return _cum_func(np.cumprod, values, mask, skipna=skipna) -def cummin(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna) -def cummax(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e0a07171dffe1..04f5f3f0f62b2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1411,9 +1411,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] ) - raise NotImplementedError( - f"Accumlation {name} not implemented for {type(self)}" - ) + raise TypeError(f"Accumlation {name} not supported for {type(self)}") @unpack_zerodim_and_defer("__add__") def __add__(self, other): From 4ff6e4d118fe5454525dc77ac69a4c46019cc631 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 29 Nov 2022 21:42:06 +0100 Subject: [PATCH 81/89] Change _data --- pandas/core/arrays/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 04f5f3f0f62b2..46afbad91b64b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1399,7 +1399,7 @@ def _time_shift( def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): - data = self._data.copy() + data = self._ndarray.copy() if name in {"cummin", "cummax"}: func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate From 57abcc331398012e5ec3fca5585f70e77ca46c96 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 29 Nov 2022 21:46:21 +0100 Subject: [PATCH 82/89] Remove --- pandas/core/generic.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b657c599362f6..1e4709addb42f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10859,10 +10859,6 @@ def _accum_func( def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - from pandas.core.construction import ensure_wrapped_if_datetimelike - - values = ensure_wrapped_if_datetimelike(values) - result: np.ndarray | ExtensionArray if isinstance(values, ExtensionArray): result = values._accumulate(name, skipna=skipna, **kwargs) From 12679618d8225c1e0bf782d082616d4cb06d142c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 30 Nov 2022 22:01:50 +0100 Subject: [PATCH 83/89] Add todo --- pandas/core/arrays/timedeltas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index cd0283e308120..cbc39d7123b84 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -418,6 +418,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): data = self._data.copy() if name in {"cumsum", "cumprod"}: + # TODO: cumprod should not work here func = np.cumsum if name == "cumsum" else np.cumprod result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) From c770872c7df1f6c5db9e2cb77b2fd314900f915e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 30 Nov 2022 22:02:33 +0100 Subject: [PATCH 84/89] Fix typo --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 94af15809c6d2..0ebb0c7bd8af8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1373,7 +1373,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg] ) - raise TypeError(f"Accumlation {name} not supported for {type(self)}") + raise TypeError(f"Accumulation {name} not supported for {type(self)}") @unpack_zerodim_and_defer("__add__") def __add__(self, other): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index cbc39d7123b84..bf14a4e51b44d 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -418,7 +418,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): data = self._data.copy() if name in {"cumsum", "cumprod"}: - # TODO: cumprod should not work here + # TODO: cumprod should not work here GH#48111 func = np.cumsum if name == "cumsum" else np.cumprod result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna)) From cb7277b8fb01e406baffbacd26151ccbca8bd0ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 30 Nov 2022 22:03:02 +0100 Subject: [PATCH 85/89] Adjust var --- pandas/tests/extension/base/accumulate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index e6f476ff0ac4f..c35f43bc8a2f4 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -29,15 +29,15 @@ class BaseNoAccumulateTests(BaseAccumulateTests): @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations - s = pd.Series(data) + ser = pd.Series(data) with pytest.raises(NotImplementedError): - getattr(s, op_name)(skipna=skipna) + getattr(ser, op_name)(skipna=skipna) class BaseNumericAccumulateTests(BaseAccumulateTests): @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations - s = pd.Series(data) - self.check_accumulate(s, op_name, skipna) + ser = pd.Series(data) + self.check_accumulate(ser, op_name, skipna) From 797e7241fe873531590a2cfdfc2bcd3b467ee7a7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 1 Dec 2022 00:30:32 +0100 Subject: [PATCH 86/89] Special case --- pandas/core/arrays/datetimelike.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0ebb0c7bd8af8..87b44216fd8c9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1361,7 +1361,10 @@ def _addsub_object_array(self, other: np.ndarray, op): def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): - data = self._ndarray.copy() + if is_period_dtype(self.dtype): + data = self + else: + data = self._ndarray.copy() if name in {"cummin", "cummax"}: func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate From ab3cf7ef2e0d7231c8e31d5b36ff89b4ca1bd20b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 10 Dec 2022 21:57:39 +0100 Subject: [PATCH 87/89] Fix tests --- pandas/core/arrays/timedeltas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index bf14a4e51b44d..e68b56c57c176 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -415,7 +415,7 @@ def std( def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): - data = self._data.copy() + data = self._ndarray.copy() if name in {"cumsum", "cumprod"}: # TODO: cumprod should not work here GH#48111 From e1d2a4e7c815ab2b5e9c59827878b7cac20c9e02 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Dec 2022 17:52:29 +0100 Subject: [PATCH 88/89] Combine classes --- pandas/tests/extension/base/__init__.py | 5 +---- pandas/tests/extension/base/accumulate.py | 8 +------- pandas/tests/extension/test_boolean.py | 6 +++++- pandas/tests/extension/test_categorical.py | 6 ++++-- pandas/tests/extension/test_floating.py | 6 ++++-- pandas/tests/extension/test_integer.py | 6 +++++- pandas/tests/extension/test_sparse.py | 6 ++++-- 7 files changed, 24 insertions(+), 19 deletions(-) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 0089aa08da65a..7e765cc5342d1 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -41,10 +41,7 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. """ -from pandas.tests.extension.base.accumulate import ( # noqa - BaseNoAccumulateTests, - BaseNumericAccumulateTests, -) +from pandas.tests.extension.base.accumulate import BaseAccumulateTests # noqa from pandas.tests.extension.base.casting import BaseCastingTests # noqa from pandas.tests.extension.base.constructors import BaseConstructorsTests # noqa from pandas.tests.extension.base.dim2 import ( # noqa diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index c35f43bc8a2f4..868172f930844 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -22,20 +22,14 @@ def check_accumulate(self, s, op_name, skipna): expected = getattr(s.astype("float64"), op_name)(skipna=skipna) self.assert_series_equal(result, expected, check_dtype=False) - -class BaseNoAccumulateTests(BaseAccumulateTests): - """we don't define any accumulations""" - @pytest.mark.parametrize("skipna", [True, False]) - def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations ser = pd.Series(data) with pytest.raises(NotImplementedError): getattr(ser, op_name)(skipna=skipna) - -class BaseNumericAccumulateTests(BaseAccumulateTests): @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 557aebf167a00..b611701e4e429 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -395,7 +395,7 @@ class TestUnaryOps(base.BaseUnaryOpsTests): pass -class TestNumericAccumulation(base.BaseNumericAccumulateTests): +class TestAccumulation(base.BaseAccumulateTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) @@ -403,6 +403,10 @@ def check_accumulate(self, s, op_name, skipna): if op_name in ("cummin", "cummax"): assert is_bool_dtype(result) + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): + pass + class TestParsing(base.BaseParsingTests): pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 5ba548d6dd222..9a363c6a0f022 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -156,8 +156,10 @@ class TestReduce(base.BaseNoReduceTests): pass -class TestAccumulate(base.BaseNoAccumulateTests): - pass +class TestAccumulate(base.BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + pass class TestMethods(base.BaseMethodsTests): diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index e5b319063c415..f67f7dc56d26f 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -219,5 +219,7 @@ class Test2DCompat(base.Dim2CompatTests): pass -class TestNumericAccumulation(base.BaseNumericAccumulateTests): - pass +class TestAccumulation(base.BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): + pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index a831ccb6773f5..788a0bf46afc5 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -232,7 +232,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): pass -class TestNumericAccumulation(base.BaseNumericAccumulateTests): +class TestAccumulation(base.BaseAccumulateTests): def check_accumulate(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 @@ -277,6 +277,10 @@ def check_accumulate(self, s, op_name, skipna): else: raise NotImplementedError(f"{op_name} not supported") + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna): + pass + class TestPrinting(base.BasePrintingTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 0465d352d1d82..022e5cb764e14 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -484,5 +484,7 @@ def test_EA_types(self, engine, data): super().test_EA_types(engine, data) -class TestNoNumericAccumulations(base.BaseNoAccumulateTests): - pass +class TestNoNumericAccumulations(base.BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + pass From e7dbd5fd7be8cb44762a3321d2eb1a607439fa11 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Dec 2022 22:21:39 +0100 Subject: [PATCH 89/89] Fix mypy --- pandas/core/arrays/datetimelike.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0578616f31ba8..69b7d5a220d24 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1357,7 +1357,9 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): if is_period_dtype(self.dtype): data = self else: - data = self._ndarray.copy() + # Incompatible types in assignment (expression has type + # "ndarray[Any, Any]", variable has type "DatetimeLikeArrayMixin" + data = self._ndarray.copy() # type: ignore[assignment] if name in {"cummin", "cummax"}: func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate