From 8a00df4352b7efafeb37e7b5630a0b9efc710b97 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Tue, 31 Dec 2024 10:51:56 -0500
Subject: [PATCH 01/11] ENH: Implement cum* methods for PyArrow strings

---
 pandas/conftest.py                     | 20 ++++++++
 pandas/core/arrays/arrow/array.py      | 56 +++++++++++++++++++++++
 pandas/tests/apply/test_str.py         |  9 +---
 pandas/tests/series/test_cumulative.py | 63 ++++++++++++++++++++++++++
 4 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 106518678df6a..e62e77eee492a 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1317,6 +1317,26 @@ def nullable_string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        pytest.param(
+            pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow")
+        ),
+        pytest.param(
+            pd.StringDtype("pyarrow", na_value=pd.NA), marks=td.skip_if_no("pyarrow")
+        ),
+    ]
+)
+def pyarrow_string_dtype(request):
+    """
+    Parametrized fixture for string dtypes backed by Pyarrow.
+
+    * 'pd.StringDtype("pyarrow", na_value=np.nan)'
+    * 'pd.StringDtype("pyarrow", na_value=pd.NA)'
+    """
+    return request.param
+
+
 @pytest.fixture(
     params=[
         "python",
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index afa219f611992..0fc1291622f63 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -41,6 +41,7 @@
     is_list_like,
     is_numeric_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -1619,6 +1620,9 @@ def _accumulate(
         ------
         NotImplementedError : subclass does not define accumulations
         """
+        if is_string_dtype(self):
+            return self._str_accumulate(name=name, skipna=skipna, **kwargs)
+
         pyarrow_name = {
             "cummax": "cumulative_max",
             "cummin": "cumulative_min",
@@ -1654,6 +1658,58 @@ def _accumulate(
 
         return type(self)(result)
 
+    def _str_accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> ArrowExtensionArray | ExtensionArray:
+        """
+        Accumulate implementation for strings, see `_accumulate` docstring for details.
+
+        pyarrow.compute does not implement these methods for strings.
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # When present and skipna is False, we stop of at the first NA value.
+        # as the tail becomes all NA values.
+        head: pa.array | None = None
+        tail: pa.array | None = None
+        pa_array = self._pa_array
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        if self._hasna:
+            if skipna:
+                if name == "cumsum":
+                    pa_array = pc.fill_null(pa_array, "")
+                else:
+                    pa_array = pc.fill_null_forward(pa_array)
+                    nulls = pc.is_null(pa_array)
+                    idx = pc.index(nulls, False).as_py()
+                    if idx == -1:
+                        idx = len(pa_array)
+                    if idx > 0:
+                        head = pa.array([""] * idx, type=pa_array.type)
+                        pa_array = pa_array[idx:].combine_chunks()
+            else:
+                nulls = pc.is_null(pa_array)
+                idx = pc.index(nulls, True).as_py()
+                tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
+                pa_array = pa_array[:idx].combine_chunks()
+
+        pa_result = pa.array(np_func(pa_array), type=pa_array.type)
+
+        if head is not None or tail is not None:
+            head = pa.array([], type=pa_array.type) if head is None else head
+            tail = pa.array([], type=pa_array.type) if tail is None else tail
+            pa_result = pa.concat_arrays([head, pa_result, tail])
+
+        result = type(self)(pa_result)
+        return result
+
     def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar:
         """
         Return a pyarrow scalar result of performing the reduction operation.
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index c52168ae48ca8..e224b07a1097b 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -159,17 +159,10 @@ def test_agg_cython_table_series(series, func, expected):
         ),
     ),
 )
-def test_agg_cython_table_transform_series(request, series, func, expected):
+def test_agg_cython_table_transform_series(series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
-    if series.dtype == "string" and func == "cumsum":
-        request.applymarker(
-            pytest.mark.xfail(
-                raises=(TypeError, NotImplementedError),
-                reason="TODO(infer_string) cumsum not yet implemented for string",
-            )
-        )
     warn = None if isinstance(func, str) else FutureWarning
     with tm.assert_produces_warning(warn, match="is currently using Series.*"):
         result = series.agg(func)
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
index a9d5486139b46..c2c2588ca01ce 100644
--- a/pandas/tests/series/test_cumulative.py
+++ b/pandas/tests/series/test_cumulative.py
@@ -227,3 +227,66 @@ def test_cumprod_timedelta(self):
         ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)])
         with pytest.raises(TypeError, match="cumprod not supported for Timedelta"):
             ser.cumprod()
+
+    @pytest.mark.parametrize(
+        "data, skipna, expected_data",
+        [
+            ([], True, []),
+            ([], False, []),
+            (["x", "z", "y"], True, ["x", "xz", "xzy"]),
+            (["x", "z", "y"], False, ["x", "xz", "xzy"]),
+            (["x", pd.NA, "y"], True, ["x", "x", "xy"]),
+            (["x", pd.NA, "y"], False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], False, [pd.NA, pd.NA, pd.NA]),
+        ],
+    )
+    def test_cumsum_pyarrow_strings(
+        self, pyarrow_string_dtype, data, skipna, expected_data
+    ):
+        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        result = ser.cumsum(skipna=skipna)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data, op, skipna, expected_data",
+        [
+            ([], "cummin", True, []),
+            ([], "cummin", False, []),
+            (["y", "z", "x"], "cummin", True, ["y", "y", "x"]),
+            (["y", "z", "x"], "cummin", False, ["y", "y", "x"]),
+            (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]),
+            (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]),
+            ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]),
+            ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([], "cummax", True, []),
+            ([], "cummax", False, []),
+            (["x", "z", "y"], "cummax", True, ["x", "z", "z"]),
+            (["x", "z", "y"], "cummax", False, ["x", "z", "z"]),
+            (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]),
+            (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]),
+            ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+        ],
+    )
+    def test_cummin_cummax_pyarrow_strings(
+        self, pyarrow_string_dtype, data, op, skipna, expected_data
+    ):
+        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        if expected_data is None:
+            expected_data = ser.dtype.na_value
+        method = getattr(ser, op)
+        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        result = method(skipna=skipna)
+        tm.assert_series_equal(result, expected)
+
+    def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna):
+        ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype)
+        msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'"
+        with pytest.raises(TypeError, match=msg):
+            ser.cumprod(skipna=skipna)

From 170f2e2902673eb855d4855af2a3d2d48e20ee6a Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Tue, 31 Dec 2024 11:11:41 -0500
Subject: [PATCH 02/11] cleanup

---
 pandas/core/arrays/arrow/array.py      | 3 +--
 pandas/tests/series/test_cumulative.py | 3 +++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 0fc1291622f63..974de43e8d8f9 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1670,8 +1670,7 @@ def _str_accumulate(
             msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
             raise TypeError(msg)
 
-        # When present and skipna is False, we stop of at the first NA value.
-        # as the tail becomes all NA values.
+        # We may need to strip out leading / trailing NA values
         head: pa.array | None = None
         tail: pa.array | None = None
         pa_array = self._pa_array
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
index c2c2588ca01ce..bab2ff776c3b1 100644
--- a/pandas/tests/series/test_cumulative.py
+++ b/pandas/tests/series/test_cumulative.py
@@ -244,6 +244,7 @@ def test_cumprod_timedelta(self):
     def test_cumsum_pyarrow_strings(
         self, pyarrow_string_dtype, data, skipna, expected_data
     ):
+        # https://github.com/pandas-dev/pandas/pull/60633
         ser = pd.Series(data, dtype=pyarrow_string_dtype)
         expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
         result = ser.cumsum(skipna=skipna)
@@ -277,6 +278,7 @@ def test_cumsum_pyarrow_strings(
     def test_cummin_cummax_pyarrow_strings(
         self, pyarrow_string_dtype, data, op, skipna, expected_data
     ):
+        # https://github.com/pandas-dev/pandas/pull/60633
         ser = pd.Series(data, dtype=pyarrow_string_dtype)
         if expected_data is None:
             expected_data = ser.dtype.na_value
@@ -286,6 +288,7 @@ def test_cummin_cummax_pyarrow_strings(
         tm.assert_series_equal(result, expected)
 
     def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna):
+        # https://github.com/pandas-dev/pandas/pull/60633
         ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype)
         msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'"
         with pytest.raises(TypeError, match=msg):

From 4ccf0d4ad77e025f79ff0377297b0676632b7963 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Tue, 31 Dec 2024 11:16:53 -0500
Subject: [PATCH 03/11] Cleanup

---
 pandas/core/arrays/arrow/array.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 974de43e8d8f9..bd22a78436067 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1685,7 +1685,10 @@ def _str_accumulate(
                 if name == "cumsum":
                     pa_array = pc.fill_null(pa_array, "")
                 else:
+                    # After the first non-NA value we can retain the running min/max
+                    # by forward filling.
                     pa_array = pc.fill_null_forward(pa_array)
+                    # But any leading NA values should result in "".
                     nulls = pc.is_null(pa_array)
                     idx = pc.index(nulls, False).as_py()
                     if idx == -1:
@@ -1694,6 +1697,8 @@ def _str_accumulate(
                         head = pa.array([""] * idx, type=pa_array.type)
                         pa_array = pa_array[idx:].combine_chunks()
             else:
+                # When not skipping NA values, the result should be null from
+                # the first NA value onward.
                 nulls = pc.is_null(pa_array)
                 idx = pc.index(nulls, True).as_py()
                 tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
@@ -1701,10 +1706,11 @@ def _str_accumulate(
 
         pa_result = pa.array(np_func(pa_array), type=pa_array.type)
 
-        if head is not None or tail is not None:
-            head = pa.array([], type=pa_array.type) if head is None else head
-            tail = pa.array([], type=pa_array.type) if tail is None else tail
-            pa_result = pa.concat_arrays([head, pa_result, tail])
+        assert head is None or tail is None
+        if head is not None:
+            pa_result = pa.concat_arrays([head, pa_result])
+        elif tail is not None:
+            pa_result = pa.concat_arrays([pa_result, tail])
 
         result = type(self)(pa_result)
         return result

From be726f0065dff04659ae3f1a1b61d7c6f718955e Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Tue, 31 Dec 2024 11:28:07 -0500
Subject: [PATCH 04/11] fixup

---
 pandas/conftest.py                     | 12 ++++--------
 pandas/tests/series/test_cumulative.py |  4 +++-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index e62e77eee492a..0b3654bbcc16e 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1319,20 +1319,16 @@ def nullable_string_dtype(request):
 
 @pytest.fixture(
     params=[
-        pytest.param(
-            pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow")
-        ),
-        pytest.param(
-            pd.StringDtype("pyarrow", na_value=pd.NA), marks=td.skip_if_no("pyarrow")
-        ),
+        pytest.param("str[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
     ]
 )
 def pyarrow_string_dtype(request):
     """
     Parametrized fixture for string dtypes backed by Pyarrow.
 
-    * 'pd.StringDtype("pyarrow", na_value=np.nan)'
-    * 'pd.StringDtype("pyarrow", na_value=pd.NA)'
+    * 'str[pyarrow]'
+    * 'string[pyarrow]'
     """
     return request.param
 
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
index bab2ff776c3b1..ba850e8e2ed21 100644
--- a/pandas/tests/series/test_cumulative.py
+++ b/pandas/tests/series/test_cumulative.py
@@ -6,6 +6,8 @@
 tests.frame.test_cumulative
 """
 
+import re
+
 import numpy as np
 import pytest
 
@@ -290,6 +292,6 @@ def test_cummin_cummax_pyarrow_strings(
     def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna):
         # https://github.com/pandas-dev/pandas/pull/60633
         ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype)
-        msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'"
+        msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'")
         with pytest.raises(TypeError, match=msg):
             ser.cumprod(skipna=skipna)

From bf38cef88654d5bc6f911ed9375f6fe675f20d01 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Tue, 31 Dec 2024 12:16:18 -0500
Subject: [PATCH 05/11] Fix extension tests

---
 pandas/tests/extension/base/accumulate.py |  5 ++--
 pandas/tests/extension/test_arrow.py      | 15 ++++++----
 pandas/tests/extension/test_string.py     |  7 +++++
 pandas/tests/series/test_cumulative.py    | 36 +++++++----------------
 4 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py
index 9a41a3a582c4a..9a2f186c2a00b 100644
--- a/pandas/tests/extension/base/accumulate.py
+++ b/pandas/tests/extension/base/accumulate.py
@@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
     def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
         try:
             alt = ser.astype("float64")
-        except TypeError:
-            # e.g. Period can't be cast to float64
+        except (TypeError, ValueError):
+            # e.g. Period can't be cast to float64 (TypeError)
+            #      String can't be cast to float64 (ValueError)
             alt = ser.astype(object)
 
         result = getattr(ser, op_name)(skipna=skipna)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 6dd1f3f15bc15..0ba0fc6790015 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
         # attribute "pyarrow_dtype"
         pa_type = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
 
-        if (
-            pa.types.is_string(pa_type)
-            or pa.types.is_binary(pa_type)
-            or pa.types.is_decimal(pa_type)
-        ):
+        if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type):
             if op_name in ["cumsum", "cumprod", "cummax", "cummin"]:
                 return False
+        elif pa.types.is_string(pa_type):
+            if op_name == "cumprod":
+                return False
         elif pa.types.is_boolean(pa_type):
             if op_name in ["cumprod", "cummax", "cummin"]:
                 return False
@@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
     def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request):
         pa_type = data.dtype.pyarrow_dtype
         op_name = all_numeric_accumulations
+
+        if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]:
+            # https://github.com/pandas-dev/pandas/pull/60633
+            # Doesn't fit test structure, tested in series/test_cumulative.py instead.
+            return
+
         ser = pd.Series(data)
 
         if not self._supports_accumulation(ser, op_name):
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index e19351b2ad058..6434487d67a4d 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -192,6 +192,13 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
             and op_name in ("any", "all")
         )
 
+    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
+        return ser.dtype.storage == "pyarrow" and op_name in [
+            "cummin",
+            "cummax",
+            "cumsum",
+        ]
+
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = cast(StringDtype, tm.get_dtype(obj))
         if op_name in ["__add__", "__radd__"]:
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
index ba850e8e2ed21..610903d77512d 100644
--- a/pandas/tests/series/test_cumulative.py
+++ b/pandas/tests/series/test_cumulative.py
@@ -230,31 +230,19 @@ def test_cumprod_timedelta(self):
         with pytest.raises(TypeError, match="cumprod not supported for Timedelta"):
             ser.cumprod()
 
-    @pytest.mark.parametrize(
-        "data, skipna, expected_data",
-        [
-            ([], True, []),
-            ([], False, []),
-            (["x", "z", "y"], True, ["x", "xz", "xzy"]),
-            (["x", "z", "y"], False, ["x", "xz", "xzy"]),
-            (["x", pd.NA, "y"], True, ["x", "x", "xy"]),
-            (["x", pd.NA, "y"], False, ["x", pd.NA, pd.NA]),
-            ([pd.NA, pd.NA, pd.NA], True, ["", "", ""]),
-            ([pd.NA, pd.NA, pd.NA], False, [pd.NA, pd.NA, pd.NA]),
-        ],
-    )
-    def test_cumsum_pyarrow_strings(
-        self, pyarrow_string_dtype, data, skipna, expected_data
-    ):
-        # https://github.com/pandas-dev/pandas/pull/60633
-        ser = pd.Series(data, dtype=pyarrow_string_dtype)
-        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
-        result = ser.cumsum(skipna=skipna)
-        tm.assert_series_equal(result, expected)
-
     @pytest.mark.parametrize(
         "data, op, skipna, expected_data",
         [
+            ([], "cumsum", True, []),
+            ([], "cumsum", False, []),
+            (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]),
+            (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]),
+            (["x", pd.NA, "y"], "cumsum", True, ["x", "x", "xy"]),
+            (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, "x", "y"], "cumsum", True, ["", "x", "xy"]),
+            ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cumsum", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
             ([], "cummin", True, []),
             ([], "cummin", False, []),
             (["y", "z", "x"], "cummin", True, ["y", "y", "x"]),
@@ -277,13 +265,11 @@ def test_cumsum_pyarrow_strings(
             ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
         ],
     )
-    def test_cummin_cummax_pyarrow_strings(
+    def test_cum_methods_pyarrow_strings(
         self, pyarrow_string_dtype, data, op, skipna, expected_data
     ):
         # https://github.com/pandas-dev/pandas/pull/60633
         ser = pd.Series(data, dtype=pyarrow_string_dtype)
-        if expected_data is None:
-            expected_data = ser.dtype.na_value
         method = getattr(ser, op)
         expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
         result = method(skipna=skipna)

From 1f8e36ec6fc38b9eefecb32a0a635a1da8bb6bc1 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 1 Jan 2025 16:38:56 -0500
Subject: [PATCH 06/11] xfail test when there is no pyarrow

---
 pandas/tests/apply/test_str.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index e224b07a1097b..ce71cfec535e4 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -4,7 +4,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import WASM
+from pandas.compat import (
+    HAS_PYARROW,
+    WASM,
+)
 
 from pandas.core.dtypes.common import is_number
 
@@ -159,10 +162,17 @@ def test_agg_cython_table_series(series, func, expected):
         ),
     ),
 )
-def test_agg_cython_table_transform_series(series, func, expected):
+def test_agg_cython_table_transform_series(request, series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+    if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW:
+        request.applymarker(
+            pytest.mark.xfail(
+                raises=NotImplementedError,
+                reason="TODO(infer_string) cumsum not yet implemented for string",
+            )
+        )
     warn = None if isinstance(func, str) else FutureWarning
     with tm.assert_produces_warning(warn, match="is currently using Series.*"):
         result = series.agg(func)

From dd8fcbe1b6391b3b15c5549962a6ae5f65434c1a Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Wed, 1 Jan 2025 16:49:24 -0500
Subject: [PATCH 07/11] mypy fixups

---
 pandas/core/arrays/arrow/array.py     | 3 ++-
 pandas/tests/extension/test_string.py | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index bd22a78436067..6a937822fadd4 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1704,7 +1704,8 @@ def _str_accumulate(
                 tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
                 pa_array = pa_array[:idx].combine_chunks()
 
-        pa_result = pa.array(np_func(pa_array), type=pa_array.type)
+        # error: Cannot call function of unknown type
+        pa_result = pa.array(np_func(pa_array), type=pa_array.type)  # type: ignore[operator]
 
         assert head is None or tail is None
         if head is not None:
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 6434487d67a4d..6ce48e434d329 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -24,6 +24,8 @@
 
 from pandas.compat import HAS_PYARROW
 
+from pandas.core.dtypes.base import StorageExtensionDtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.api.types import is_string_dtype
@@ -193,6 +195,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         )
 
     def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
+        assert isinstance(ser.dtype, StorageExtensionDtype)
         return ser.dtype.storage == "pyarrow" and op_name in [
             "cummin",
             "cummax",

From 46ff2c13136aa2e4ce86e9dc02b4ff6845580ab5 Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Sun, 5 Jan 2025 14:55:41 -0500
Subject: [PATCH 08/11] Change logic & whatsnew

---
 doc/source/whatsnew/v2.3.0.rst         |  2 +-
 pandas/core/arrays/arrow/array.py      | 29 ++++++++++----------------
 pandas/tests/series/test_cumulative.py | 18 ++++++++--------
 3 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index b107a5d3ba100..9e0e095eb4de8 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -35,8 +35,8 @@ Other enhancements
 - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
   when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
   updated to work correctly with NumPy >= 2 (:issue:`57739`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_230.notable_bug_fixes:
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index bcf51579bc968..1c97e372ee7b5 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1670,9 +1670,9 @@ def _str_accumulate(
             msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
             raise TypeError(msg)
 
-        # We may need to strip out leading / trailing NA values
-        head: pa.array | None = None
+        # We may need to strip out trailing NA values
         tail: pa.array | None = None
+        na_mask: pa.array | None = None
         pa_array = self._pa_array
         np_func = {
             "cumsum": np.cumsum,
@@ -1681,37 +1681,30 @@ def _str_accumulate(
         }[name]
 
         if self._hasna:
+            na_mask = pc.is_null(pa_array)
+            if pc.all(na_mask) == pa.scalar(True):
+                return type(self)(pa_array)
             if skipna:
                 if name == "cumsum":
                     pa_array = pc.fill_null(pa_array, "")
                 else:
-                    # After the first non-NA value we can retain the running min/max
-                    # by forward filling.
+                    # We can retain the running min/max by forward/backward filling.
                     pa_array = pc.fill_null_forward(pa_array)
-                    # But any leading NA values should result in "".
-                    nulls = pc.is_null(pa_array)
-                    idx = pc.index(nulls, False).as_py()
-                    if idx == -1:
-                        idx = len(pa_array)
-                    if idx > 0:
-                        head = pa.array([""] * idx, type=pa_array.type)
-                        pa_array = pa_array[idx:].combine_chunks()
+                    pa_array = pc.fill_null_backward(pa_array)
             else:
                 # When not skipping NA values, the result should be null from
                 # the first NA value onward.
-                nulls = pc.is_null(pa_array)
-                idx = pc.index(nulls, True).as_py()
+                idx = pc.index(na_mask, True).as_py()
                 tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
                 pa_array = pa_array[:idx].combine_chunks()
 
         # error: Cannot call function of unknown type
         pa_result = pa.array(np_func(pa_array), type=pa_array.type)  # type: ignore[operator]
 
-        assert head is None or tail is None
-        if head is not None:
-            pa_result = pa.concat_arrays([head, pa_result])
-        elif tail is not None:
+        if tail is not None:
             pa_result = pa.concat_arrays([pa_result, tail])
+        elif na_mask is not None:
+            pa_result = pc.if_else(na_mask, None, pa_result)
 
         result = type(self)(pa_result)
         return result
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
index 610903d77512d..89882d9d797c5 100644
--- a/pandas/tests/series/test_cumulative.py
+++ b/pandas/tests/series/test_cumulative.py
@@ -237,31 +237,31 @@ def test_cumprod_timedelta(self):
             ([], "cumsum", False, []),
             (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]),
             (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]),
-            (["x", pd.NA, "y"], "cumsum", True, ["x", "x", "xy"]),
+            (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]),
             (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]),
-            ([pd.NA, "x", "y"], "cumsum", True, ["", "x", "xy"]),
+            ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]),
             ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
-            ([pd.NA, pd.NA, pd.NA], "cumsum", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]),
             ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
             ([], "cummin", True, []),
             ([], "cummin", False, []),
             (["y", "z", "x"], "cummin", True, ["y", "y", "x"]),
             (["y", "z", "x"], "cummin", False, ["y", "y", "x"]),
-            (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]),
+            (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]),
             (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]),
-            ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]),
+            ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]),
             ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]),
-            ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]),
             ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]),
             ([], "cummax", True, []),
             ([], "cummax", False, []),
             (["x", "z", "y"], "cummax", True, ["x", "z", "z"]),
             (["x", "z", "y"], "cummax", False, ["x", "z", "z"]),
-            (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]),
+            (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]),
             (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]),
-            ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]),
+            ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]),
             ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]),
-            ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]),
             ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
         ],
     )

From f2b448db4196d592e6b2980f6f69198e559729e4 Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Sun, 5 Jan 2025 14:56:24 -0500
Subject: [PATCH 09/11] Change logic & whatsnew

---
 pandas/core/arrays/arrow/array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 1c97e372ee7b5..900548a239c8e 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1696,7 +1696,7 @@ def _str_accumulate(
                 # the first NA value onward.
                 idx = pc.index(na_mask, True).as_py()
                 tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
-                pa_array = pa_array[:idx].combine_chunks()
+                pa_array = pa_array[:idx]
 
         # error: Cannot call function of unknown type
         pa_result = pa.array(np_func(pa_array), type=pa_array.type)  # type: ignore[operator]

From 4d11a1d68669457fbd3ed314aacd85594624c1ff Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Sun, 5 Jan 2025 14:58:55 -0500
Subject: [PATCH 10/11] Fix fixture

---
 pandas/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 0b3654bbcc16e..4b632388a6f30 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1319,8 +1319,8 @@ def nullable_string_dtype(request):
 
 @pytest.fixture(
     params=[
-        pytest.param("str[pyarrow]", marks=td.skip_if_no("pyarrow")),
-        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param(pd.StringDtype("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
+        pytest.param(pd.StringDtype("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
     ]
 )
 def pyarrow_string_dtype(request):

From d3468ccd3f98e8e37163e832e2e02efd1b0572a1 Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Sun, 5 Jan 2025 15:09:46 -0500
Subject: [PATCH 11/11] Fixup

---
 pandas/conftest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 4b632388a6f30..f9c10a7758bd2 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1319,8 +1319,8 @@ def nullable_string_dtype(request):
 
 @pytest.fixture(
     params=[
-        pytest.param(pd.StringDtype("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
-        pytest.param(pd.StringDtype("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
+        pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
+        pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
     ]
 )
 def pyarrow_string_dtype(request):
@@ -1330,7 +1330,7 @@ def pyarrow_string_dtype(request):
     * 'str[pyarrow]'
     * 'string[pyarrow]'
     """
-    return request.param
+    return pd.StringDtype(*request.param)
 
 
 @pytest.fixture(