def _str_slice(
    self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
    """
    Slice each string of the backing array as ``value[start:stop:step]``.

    Parameters
    ----------
    start : int or None, default None
        Start position. When omitted it becomes ``-1`` if ``step`` is
        negative and ``0`` otherwise before being handed to pyarrow.
    stop : int or None, default None
        Stop position; forwarded unchanged.
    step : int or None, default None
        Step size. When omitted it becomes ``1`` before being handed to
        pyarrow.

    Returns
    -------
    Self
        A new array of the same class wrapping the sliced values.
    """
    if pa_version_under11p0:
        # GH#59724
        # Fall back to plain Python slicing applied element by element.
        window = slice(start, stop, step)
        sliced = self._apply_elementwise(lambda val: val[window])
        return type(self)(pa.chunked_array(sliced, type=self._pa_array.type))

    if start is None:
        # GH#59710
        # A negative step with no explicit start begins at the last character.
        start = -1 if (step is not None and step < 0) else 0
    if step is None:
        step = 1

    sliced = pc.utf8_slice_codeunits(
        self._pa_array, start=start, stop=stop, step=step
    )
    return type(self)(sliced)
def _str_len(self):
    """
    Compute string lengths with ``pc.utf8_length`` on the backing array.

    The raw pyarrow result is passed through ``self._convert_int_result``
    before being returned.
    """
    return self._convert_int_result(pc.utf8_length(self._pa_array))