Skip to content

BUG (string): Series.str.slice with negative step #59724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Sep 10, 2024
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,9 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)

-

Interval
^^^^^^^^
Expand Down
32 changes: 23 additions & 9 deletions pandas/core/arrays/_arrow_string_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
pa_version_under13p0,
pa_version_under17p0,
)
Expand All @@ -22,10 +23,7 @@
import pyarrow.compute as pc

if TYPE_CHECKING:
from collections.abc import (
Callable,
Sized,
)
from collections.abc import Callable

from pandas._typing import (
Scalar,
Expand All @@ -34,7 +32,7 @@


class ArrowStringArrayMixin:
_pa_array: Sized
_pa_array: pa.ChunkedArray

def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
Expand Down Expand Up @@ -96,13 +94,29 @@ def _str_get(self, i: int) -> Self:
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
null_value = pa.scalar(
None,
type=self._pa_array.type, # type: ignore[attr-defined]
)
null_value = pa.scalar(None, type=self._pa_array.type)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if pa_version_under11p0:
# GH#59724
result = self._apply_elementwise(lambda val: val[start:stop:step])
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
if start is None:
if step is not None and step < 0:
# GH#59710
start = -1
else:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
) -> Self:
Expand Down
11 changes: 0 additions & 11 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2393,17 +2393,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self:
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if start is None:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_len(self) -> Self:
return type(self)(pc.utf8_length(self._pa_array))

Expand Down
14 changes: 1 addition & 13 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ def astype(self, dtype, copy: bool = True):
_str_startswith = ArrowStringArrayMixin._str_startswith
_str_endswith = ArrowStringArrayMixin._str_endswith
_str_pad = ArrowStringArrayMixin._str_pad
_str_slice = ArrowStringArrayMixin._str_slice

def _str_contains(
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
Expand Down Expand Up @@ -351,19 +352,6 @@ def _str_fullmatch(
pat = f"{pat}$"
return self._str_match(pat, case, flags, na)

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if stop is None:
return super()._str_slice(start, stop, step)
if start is None:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_len(self):
result = pc.utf8_length(self._pa_array)
return self._convert_int_result(result)
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2036,6 +2036,7 @@ def test_str_join_string_type():
[None, 2, None, ["ab", None]],
[None, 2, 1, ["ab", None]],
[1, 3, 1, ["bc", None]],
(None, None, -1, ["dcba", None]),
],
)
def test_str_slice(start, stop, step, exp):
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/strings/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ def test_pipe_failures(any_string_dtype):
(2, 5, None, ["foo", "bar", np.nan, "baz"]),
(0, 3, -1, ["", "", np.nan, ""]),
(None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
(None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
(3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
(3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
],
Expand Down
Loading