From 8a00df4352b7efafeb37e7b5630a0b9efc710b97 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 10:51:56 -0500 Subject: [PATCH 01/11] ENH: Implement cum* methods for PyArrow strings --- pandas/conftest.py | 20 ++++++++ pandas/core/arrays/arrow/array.py | 56 +++++++++++++++++++++++ pandas/tests/apply/test_str.py | 9 +--- pandas/tests/series/test_cumulative.py | 63 ++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 106518678df6a..e62e77eee492a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1317,6 +1317,26 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param( + pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow") + ), + pytest.param( + pd.StringDtype("pyarrow", na_value=pd.NA), marks=td.skip_if_no("pyarrow") + ), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. + + * 'pd.StringDtype("pyarrow", na_value=np.nan)' + * 'pd.StringDtype("pyarrow", na_value=pd.NA)' + """ + return request.param + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index afa219f611992..0fc1291622f63 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1619,6 +1620,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1654,6 +1658,58 @@ def _accumulate( return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # When present and skipna is False, we stop of at the first NA value. + # as the tail becomes all NA values. + head: pa.array | None = None + tail: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + pa_array = pc.fill_null_forward(pa_array) + nulls = pc.is_null(pa_array) + idx = pc.index(nulls, False).as_py() + if idx == -1: + idx = len(pa_array) + if idx > 0: + head = pa.array([""] * idx, type=pa_array.type) + pa_array = pa_array[idx:].combine_chunks() + else: + nulls = pc.is_null(pa_array) + idx = pc.index(nulls, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx].combine_chunks() + + pa_result = pa.array(np_func(pa_array), type=pa_array.type) + + if head is not None or tail is not None: + head = pa.array([], type=pa_array.type) if head is None else head + tail = pa.array([], type=pa_array.type) if tail is None else tail + pa_result = pa.concat_arrays([head, pa_result, tail]) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index c52168ae48ca8..e224b07a1097b 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -159,17 +159,10 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(request, series, func, expected): +def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func == "cumsum": - request.applymarker( - pytest.mark.xfail( - raises=(TypeError, NotImplementedError), - reason="TODO(infer_string) cumsum not yet implemented for string", - ) - ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a9d5486139b46..c2c2588ca01ce 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -227,3 +227,66 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, skipna, expected_data", + [ + ([], True, []), + ([], False, []), + (["x", "z", "y"], True, ["x", "xz", "xzy"]), + (["x", "z", "y"], False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], True, ["x", "x", "xy"]), + (["x", pd.NA, "y"], False, ["x", pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cumsum_pyarrow_strings( + self, pyarrow_string_dtype, data, skipna, expected_data + ): + ser = pd.Series(data, dtype=pyarrow_string_dtype) + expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = ser.cumsum(skipna=skipna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cummin_cummax_pyarrow_strings( + self, pyarrow_string_dtype, data, op, skipna, expected_data + ): + ser = pd.Series(data, dtype=pyarrow_string_dtype) + if expected_data is None: + expected_data = ser.dtype.na_value + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'" + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) From 170f2e2902673eb855d4855af2a3d2d48e20ee6a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 11:11:41 -0500 Subject: [PATCH 02/11] cleanup --- pandas/core/arrays/arrow/array.py | 3 +-- pandas/tests/series/test_cumulative.py | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0fc1291622f63..974de43e8d8f9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1670,8 +1670,7 @@ def _str_accumulate( msg = f"operation '{name}' not supported for dtype '{self.dtype}'" raise TypeError(msg) - # When present and skipna is False, we stop of at the first NA value. - # as the tail becomes all NA values. + # We may need to strip out leading / trailing NA values head: pa.array | None = None tail: pa.array | None = None pa_array = self._pa_array diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index c2c2588ca01ce..bab2ff776c3b1 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -244,6 +244,7 @@ def test_cumprod_timedelta(self): def test_cumsum_pyarrow_strings( self, pyarrow_string_dtype, data, skipna, expected_data ): + # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(data, dtype=pyarrow_string_dtype) expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) result = ser.cumsum(skipna=skipna) @@ -277,6 +278,7 @@ def test_cumsum_pyarrow_strings( def test_cummin_cummax_pyarrow_strings( self, pyarrow_string_dtype, data, op, skipna, expected_data ): + # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(data, dtype=pyarrow_string_dtype) if expected_data is None: expected_data = ser.dtype.na_value @@ -286,6 +288,7 @@ def test_cummin_cummax_pyarrow_strings( tm.assert_series_equal(result, expected) def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'" with pytest.raises(TypeError, match=msg): From 4ccf0d4ad77e025f79ff0377297b0676632b7963 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 11:16:53 -0500 Subject: [PATCH 03/11] Cleanup --- pandas/core/arrays/arrow/array.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 974de43e8d8f9..bd22a78436067 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1685,7 +1685,10 @@ def _str_accumulate( if name == "cumsum": pa_array = pc.fill_null(pa_array, "") else: + # After the first non-NA value we can retain the running min/max + # by forward filling. pa_array = pc.fill_null_forward(pa_array) + # But any leading NA values should result in "". nulls = pc.is_null(pa_array) idx = pc.index(nulls, False).as_py() if idx == -1: @@ -1694,6 +1697,8 @@ def _str_accumulate( head = pa.array([""] * idx, type=pa_array.type) pa_array = pa_array[idx:].combine_chunks() else: + # When not skipping NA values, the result should be null from + # the first NA value onward. nulls = pc.is_null(pa_array) idx = pc.index(nulls, True).as_py() tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) @@ -1701,10 +1706,11 @@ def _str_accumulate( pa_result = pa.array(np_func(pa_array), type=pa_array.type) - if head is not None or tail is not None: - head = pa.array([], type=pa_array.type) if head is None else head - tail = pa.array([], type=pa_array.type) if tail is None else tail - pa_result = pa.concat_arrays([head, pa_result, tail]) + assert head is None or tail is None + if head is not None: + pa_result = pa.concat_arrays([head, pa_result]) + elif tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) result = type(self)(pa_result) return result From be726f0065dff04659ae3f1a1b61d7c6f718955e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 11:28:07 -0500 Subject: [PATCH 04/11] fixup --- pandas/conftest.py | 12 ++++-------- pandas/tests/series/test_cumulative.py | 4 +++- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e62e77eee492a..0b3654bbcc16e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1319,20 +1319,16 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ - pytest.param( - pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow") - ), - pytest.param( - pd.StringDtype("pyarrow", na_value=pd.NA), marks=td.skip_if_no("pyarrow") - ), + pytest.param("str[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), ] ) def pyarrow_string_dtype(request): """ Parametrized fixture for string dtypes backed by Pyarrow. - * 'pd.StringDtype("pyarrow", na_value=np.nan)' - * 'pd.StringDtype("pyarrow", na_value=pd.NA)' + * 'str[pyarrow]' + * 'string[pyarrow]' """ return request.param diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index bab2ff776c3b1..ba850e8e2ed21 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -290,6 +292,6 @@ def test_cummin_cummax_pyarrow_strings( def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) - msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'" + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") with pytest.raises(TypeError, match=msg): ser.cumprod(skipna=skipna) From bf38cef88654d5bc6f911ed9375f6fe675f20d01 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 12:16:18 -0500 Subject: [PATCH 05/11] Fix extension tests --- pandas/tests/extension/base/accumulate.py | 5 ++-- pandas/tests/extension/test_arrow.py | 15 ++++++---- pandas/tests/extension/test_string.py | 7 +++++ pandas/tests/series/test_cumulative.py | 36 +++++++---------------- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6dd1f3f15bc15..0ba0fc6790015 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. + return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e19351b2ad058..6434487d67a4d 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -192,6 +192,13 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + return ser.dtype.storage == "pyarrow" and op_name in [ + "cummin", + "cummax", + "cumsum", + ] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index ba850e8e2ed21..610903d77512d 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -230,31 +230,19 @@ def test_cumprod_timedelta(self): with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() - @pytest.mark.parametrize( - "data, skipna, expected_data", - [ - ([], True, []), - ([], False, []), - (["x", "z", "y"], True, ["x", "xz", "xzy"]), - (["x", "z", "y"], False, ["x", "xz", "xzy"]), - (["x", pd.NA, "y"], True, ["x", "x", "xy"]), - (["x", pd.NA, "y"], False, ["x", pd.NA, pd.NA]), - ([pd.NA, pd.NA, pd.NA], True, ["", "", ""]), - ([pd.NA, pd.NA, pd.NA], False, [pd.NA, pd.NA, pd.NA]), - ], - ) - def test_cumsum_pyarrow_strings( - self, pyarrow_string_dtype, data, skipna, expected_data - ): - # https://github.com/pandas-dev/pandas/pull/60633 - ser = pd.Series(data, dtype=pyarrow_string_dtype) - expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) - result = ser.cumsum(skipna=skipna) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "data, op, skipna, expected_data", [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", "x", "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, ["", "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), ([], "cummin", True, []), ([], "cummin", False, []), (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), @@ -277,13 +265,11 @@ def test_cumsum_pyarrow_strings( ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), ], ) - def test_cummin_cummax_pyarrow_strings( + def test_cum_methods_pyarrow_strings( self, pyarrow_string_dtype, data, op, skipna, expected_data ): # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(data, dtype=pyarrow_string_dtype) - if expected_data is None: - expected_data = ser.dtype.na_value method = getattr(ser, op) expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) result = method(skipna=skipna) From 1f8e36ec6fc38b9eefecb32a0a635a1da8bb6bc1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 1 Jan 2025 16:38:56 -0500 Subject: [PATCH 06/11] xfail test when there is no pyarrow --- pandas/tests/apply/test_str.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index e224b07a1097b..ce71cfec535e4 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,7 +4,10 @@ import numpy as np import pytest -from pandas.compat import WASM +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.core.dtypes.common import is_number @@ -159,10 +162,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: + request.applymarker( + pytest.mark.xfail( + raises=NotImplementedError, + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) From dd8fcbe1b6391b3b15c5549962a6ae5f65434c1a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 1 Jan 2025 16:49:24 -0500 Subject: [PATCH 07/11] mypy fixups --- pandas/core/arrays/arrow/array.py | 3 ++- pandas/tests/extension/test_string.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bd22a78436067..6a937822fadd4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1704,7 +1704,8 @@ def _str_accumulate( tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) pa_array = pa_array[:idx].combine_chunks() - pa_result = pa.array(np_func(pa_array), type=pa_array.type) + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] assert head is None or tail is None if head is not None: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6434487d67a4d..6ce48e434d329 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -24,6 +24,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -193,6 +195,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: ) def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) return ser.dtype.storage == "pyarrow" and op_name in [ "cummin", "cummax", From 46ff2c13136aa2e4ce86e9dc02b4ff6845580ab5 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 5 Jan 2025 14:55:41 -0500 Subject: [PATCH 08/11] Change logic & whatsnew --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/arrow/array.py | 29 ++++++++++---------------- pandas/tests/series/test_cumulative.py | 18 ++++++++-------- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index b107a5d3ba100..9e0e095eb4de8 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,8 +35,8 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bcf51579bc968..1c97e372ee7b5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1670,9 +1670,9 @@ def _str_accumulate( msg = f"operation '{name}' not supported for dtype '{self.dtype}'" raise TypeError(msg) - # We may need to strip out leading / trailing NA values - head: pa.array | None = None + # We may need to strip out trailing NA values tail: pa.array | None = None + na_mask: pa.array | None = None pa_array = self._pa_array np_func = { "cumsum": np.cumsum, @@ -1681,37 +1681,30 @@ def _str_accumulate( }[name] if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) if skipna: if name == "cumsum": pa_array = pc.fill_null(pa_array, "") else: - # After the first non-NA value we can retain the running min/max - # by forward filling. + # We can retain the running min/max by forward/backward filling. pa_array = pc.fill_null_forward(pa_array) - # But any leading NA values should result in "". - nulls = pc.is_null(pa_array) - idx = pc.index(nulls, False).as_py() - if idx == -1: - idx = len(pa_array) - if idx > 0: - head = pa.array([""] * idx, type=pa_array.type) - pa_array = pa_array[idx:].combine_chunks() + pa_array = pc.fill_null_backward(pa_array) else: # When not skipping NA values, the result should be null from # the first NA value onward. - nulls = pc.is_null(pa_array) - idx = pc.index(nulls, True).as_py() + idx = pc.index(na_mask, True).as_py() tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) pa_array = pa_array[:idx].combine_chunks() # error: Cannot call function of unknown type pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] - assert head is None or tail is None - if head is not None: - pa_result = pa.concat_arrays([head, pa_result]) - elif tail is not None: + if tail is not None: pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) result = type(self)(pa_result) return result diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 610903d77512d..89882d9d797c5 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -237,31 +237,31 @@ def test_cumprod_timedelta(self): ([], "cumsum", False, []), (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), - (["x", pd.NA, "y"], "cumsum", True, ["x", "x", "xy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), - ([pd.NA, "x", "y"], "cumsum", True, ["", "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), - ([pd.NA, pd.NA, pd.NA], "cumsum", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), ([], "cummin", True, []), ([], "cummin", False, []), (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), - (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), - ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), - ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), ([], "cummax", True, []), ([], "cummax", False, []), (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), - (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), - ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), - ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]), ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), ], ) From f2b448db4196d592e6b2980f6f69198e559729e4 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 5 Jan 2025 14:56:24 -0500 Subject: [PATCH 09/11] Change logic & whatsnew --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1c97e372ee7b5..900548a239c8e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1696,7 +1696,7 @@ def _str_accumulate( # the first NA value onward. idx = pc.index(na_mask, True).as_py() tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) - pa_array = pa_array[:idx].combine_chunks() + pa_array = pa_array[:idx] # error: Cannot call function of unknown type pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] From 4d11a1d68669457fbd3ed314aacd85594624c1ff Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 5 Jan 2025 14:58:55 -0500 Subject: [PATCH 10/11] Fix fixture --- pandas/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 0b3654bbcc16e..4b632388a6f30 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1319,8 +1319,8 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ - pytest.param("str[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param(pd.StringDtype("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(pd.StringDtype("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), ] ) def pyarrow_string_dtype(request): From d3468ccd3f98e8e37163e832e2e02efd1b0572a1 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 5 Jan 2025 15:09:46 -0500 Subject: [PATCH 11/11] Fixup --- pandas/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 4b632388a6f30..f9c10a7758bd2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1319,8 +1319,8 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ - pytest.param(pd.StringDtype("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), - pytest.param(pd.StringDtype("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), ] ) def pyarrow_string_dtype(request): @@ -1330,7 +1330,7 @@ def pyarrow_string_dtype(request): * 'str[pyarrow]' * 'string[pyarrow]' """ - return request.param + return pd.StringDtype(*request.param) @pytest.fixture(