ENH: Implement cum* methods for PyArrow strings

rhshadrach · WillAyd · commit e837689e33ee · 2025-01-06T15:12:09.000-05:00
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1317,6 +1317,26 @@ def nullable_string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        pytest.param(
+            pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow")
+        ),
+        pytest.param(
+            pd.StringDtype("pyarrow", na_value=pd.NA), marks=td.skip_if_no("pyarrow")
+        ),
+    ]
+)
+def pyarrow_string_dtype(request):
+    """
+    Parametrized fixture for PyArrow-backed string dtypes, one per NA sentinel.
+
+    * 'pd.StringDtype("pyarrow", na_value=np.nan)'
+    * 'pd.StringDtype("pyarrow", na_value=pd.NA)'
+    """
+    return request.param
+
+
 @pytest.fixture(
     params=[
         "python",
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -41,6 +41,7 @@
     is_list_like,
     is_numeric_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -1619,6 +1620,9 @@ def _accumulate(
         ------
         NotImplementedError : subclass does not define accumulations
         """
+        if is_string_dtype(self):
+            return self._str_accumulate(name=name, skipna=skipna, **kwargs)
+
         pyarrow_name = {
             "cummax": "cumulative_max",
             "cummin": "cumulative_min",
@@ -1654,6 +1658,58 @@ def _accumulate(
 
         return type(self)(result)
 
+    def _str_accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> ArrowExtensionArray | ExtensionArray:
+        """
+        Accumulate implementation for strings, see `_accumulate` docstring for details.
+
+        pyarrow.compute does not implement these for strings; NumPy is used instead.
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # skipna=False: stop at the first NA; everything from there on (tail) is NA.
+        # skipna=True (cummin/cummax): leading NAs (head) become empty strings.
+        head: pa.array | None = None
+        tail: pa.array | None = None
+        pa_array = self._pa_array
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        if self._hasna:
+            if skipna:
+                if name == "cumsum":
+                    pa_array = pc.fill_null(pa_array, "")  # NA acts as ""
+                else:
+                    pa_array = pc.fill_null_forward(pa_array)
+                    nulls = pc.is_null(pa_array)
+                    idx = pc.index(nulls, False).as_py()  # first non-null position
+                    if idx == -1:  # no non-null value at all
+                        idx = len(pa_array)
+                    if idx > 0:
+                        head = pa.array([""] * idx, type=pa_array.type)
+                        pa_array = pa_array[idx:].combine_chunks()
+            else:
+                nulls = pc.is_null(pa_array)
+                idx = pc.index(nulls, True).as_py()  # first null position
+                tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
+                pa_array = pa_array[:idx].combine_chunks()
+
+        pa_result = pa.array(np_func(pa_array), type=pa_array.type)
+
+        if head is not None or tail is not None:
+            head = pa.array([], type=pa_array.type) if head is None else head
+            tail = pa.array([], type=pa_array.type) if tail is None else tail
+            pa_result = pa.concat_arrays([head, pa_result, tail])
+
+        result = type(self)(pa_result)
+        return result
+
     def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar:
         """
         Return a pyarrow scalar result of performing the reduction operation.
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
@@ -159,17 +159,10 @@ def test_agg_cython_table_series(series, func, expected):
         ),
     ),
 )
-def test_agg_cython_table_transform_series(request, series, func, expected):
+def test_agg_cython_table_transform_series(series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
-    if series.dtype == "string" and func == "cumsum":
-        request.applymarker(
-            pytest.mark.xfail(
-                raises=(TypeError, NotImplementedError),
-                reason="TODO(infer_string) cumsum not yet implemented for string",
-            )
-        )
     warn = None if isinstance(func, str) else FutureWarning
     with tm.assert_produces_warning(warn, match="is currently using Series.*"):
         result = series.agg(func)
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
@@ -227,3 +227,66 @@ def test_cumprod_timedelta(self):
         ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)])
         with pytest.raises(TypeError, match="cumprod not supported for Timedelta"):
             ser.cumprod()
+
+    @pytest.mark.parametrize(
+        "data, skipna, expected_data",
+        [
+            ([], True, []),
+            ([], False, []),
+            (["x", "z", "y"], True, ["x", "xz", "xzy"]),
+            (["x", "z", "y"], False, ["x", "xz", "xzy"]),
+            (["x", pd.NA, "y"], True, ["x", "x", "xy"]),  # NA acts as ""
+            (["x", pd.NA, "y"], False, ["x", pd.NA, pd.NA]),  # NA from first NA on
+            ([pd.NA, pd.NA, pd.NA], True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], False, [pd.NA, pd.NA, pd.NA]),
+        ],
+    )
+    def test_cumsum_pyarrow_strings(
+        self, pyarrow_string_dtype, data, skipna, expected_data
+    ):
+        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        result = ser.cumsum(skipna=skipna)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data, op, skipna, expected_data",
+        [
+            ([], "cummin", True, []),
+            ([], "cummin", False, []),
+            (["y", "z", "x"], "cummin", True, ["y", "y", "x"]),
+            (["y", "z", "x"], "cummin", False, ["y", "y", "x"]),
+            (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]),  # NA skipped
+            (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]),
+            ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]),  # leading NA -> ""
+            ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([], "cummax", True, []),
+            ([], "cummax", False, []),
+            (["x", "z", "y"], "cummax", True, ["x", "z", "z"]),
+            (["x", "z", "y"], "cummax", False, ["x", "z", "z"]),
+            (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]),  # NA skipped
+            (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]),  # leading NA -> ""
+            ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+        ],
+    )
+    def test_cummin_cummax_pyarrow_strings(
+        self, pyarrow_string_dtype, data, op, skipna, expected_data
+    ):
+        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        if expected_data is None:  # NOTE(review): no case above passes None
+            expected_data = ser.dtype.na_value
+        method = getattr(ser, op)
+        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        result = method(skipna=skipna)
+        tm.assert_series_equal(result, expected)
+
+    def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna):
+        ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype)
+        msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'"
+        with pytest.raises(TypeError, match=msg):  # cumprod must raise for strings
+            ser.cumprod(skipna=skipna)