Skip to content

Commit 418f890

Browse files
BUG (string): Series.str.slice with negative step (pandas-dev#59724)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent ca24b42 commit 418f890

File tree

6 files changed

+28
-33
lines changed

6 files changed

+28
-33
lines changed

doc/source/whatsnew/v2.3.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,9 @@ Conversion
103103
Strings
104104
^^^^^^^
105105
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
106+
- Bug in :meth:`Series.str.slice` with a negative ``step`` giving incorrect results for :class:`ArrowDtype` and for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59710`)
106107
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with a pyarrow-backed dtype not matching the Python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
107-
108+
-
108109

109110
Interval
110111
^^^^^^^^

pandas/core/arrays/_arrow_string_mixins.py

+23-8
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from pandas.compat import (
1313
pa_version_under10p1,
14+
pa_version_under11p0,
1415
pa_version_under13p0,
1516
pa_version_under17p0,
1617
)
@@ -22,16 +23,13 @@
2223
import pyarrow.compute as pc
2324

2425
if TYPE_CHECKING:
25-
from collections.abc import (
26-
Callable,
27-
Sized,
28-
)
26+
from collections.abc import Callable
2927

3028
from pandas._typing import Scalar
3129

3230

3331
class ArrowStringArrayMixin:
34-
_pa_array: Sized
32+
_pa_array: pa.ChunkedArray
3533

3634
def __init__(self, *args, **kwargs) -> None:
3735
raise NotImplementedError
@@ -93,12 +91,29 @@ def _str_get(self, i: int):
9391
selected = pc.utf8_slice_codeunits(
9492
self._pa_array, start=start, stop=stop, step=step
9593
)
96-
null_value = pa.scalar(
97-
None, type=self._pa_array.type # type: ignore[attr-defined]
98-
)
94+
null_value = pa.scalar(None, type=self._pa_array.type)
9995
result = pc.if_else(not_out_of_bounds, selected, null_value)
10096
return type(self)(result)
10197

98+
def _str_slice(
99+
self, start: int | None = None, stop: int | None = None, step: int | None = None
100+
):
101+
if pa_version_under11p0:
102+
# GH#59724
103+
result = self._apply_elementwise(lambda val: val[start:stop:step])
104+
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
105+
if start is None:
106+
if step is not None and step < 0:
107+
# GH#59710
108+
start = -1
109+
else:
110+
start = 0
111+
if step is None:
112+
step = 1
113+
return type(self)(
114+
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
115+
)
116+
102117
def _str_slice_replace(
103118
self, start: int | None = None, stop: int | None = None, repl: str | None = None
104119
):

pandas/core/arrays/arrow/array.py

-11
Original file line numberDiff line numberDiff line change
@@ -2368,17 +2368,6 @@ def _str_rpartition(self, sep: str, expand: bool):
23682368
result = self._apply_elementwise(predicate)
23692369
return type(self)(pa.chunked_array(result))
23702370

2371-
def _str_slice(
2372-
self, start: int | None = None, stop: int | None = None, step: int | None = None
2373-
):
2374-
if start is None:
2375-
start = 0
2376-
if step is None:
2377-
step = 1
2378-
return type(self)(
2379-
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
2380-
)
2381-
23822371
def _str_len(self):
23832372
return type(self)(pc.utf8_length(self._pa_array))
23842373

pandas/core/arrays/string_arrow.py

+1-13
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,7 @@ def _data(self):
295295
_str_startswith = ArrowStringArrayMixin._str_startswith
296296
_str_endswith = ArrowStringArrayMixin._str_endswith
297297
_str_pad = ArrowStringArrayMixin._str_pad
298+
_str_slice = ArrowStringArrayMixin._str_slice
298299

299300
def _str_contains(
300301
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
@@ -351,19 +352,6 @@ def _str_fullmatch(
351352
pat = f"{pat}$"
352353
return self._str_match(pat, case, flags, na)
353354

354-
def _str_slice(
355-
self, start: int | None = None, stop: int | None = None, step: int | None = None
356-
):
357-
if stop is None:
358-
return super()._str_slice(start, stop, step)
359-
if start is None:
360-
start = 0
361-
if step is None:
362-
step = 1
363-
return type(self)(
364-
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
365-
)
366-
367355
def _str_len(self):
368356
result = pc.utf8_length(self._pa_array)
369357
return self._convert_int_result(result)

pandas/tests/extension/test_arrow.py

+1
Original file line numberDiff line numberDiff line change
@@ -2018,6 +2018,7 @@ def test_str_join_string_type():
20182018
[None, 2, None, ["ab", None]],
20192019
[None, 2, 1, ["ab", None]],
20202020
[1, 3, 1, ["bc", None]],
2021+
(None, None, -1, ["dcba", None]),
20212022
],
20222023
)
20232024
def test_str_slice(start, stop, step, exp):

pandas/tests/strings/test_strings.py

+1
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ def test_pipe_failures(any_string_dtype):
393393
(2, 5, None, ["foo", "bar", np.nan, "baz"]),
394394
(0, 3, -1, ["", "", np.nan, ""]),
395395
(None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
396+
(None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
396397
(3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
397398
(3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
398399
],

0 commit comments

Comments
 (0)