Skip to content

Commit 50ac190

Browse files
BUG (string): Series.str.slice with negative step (#59724)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 715585d commit 50ac190

File tree

6 files changed

+28
-34
lines changed

6 files changed

+28
-34
lines changed

doc/source/whatsnew/v2.3.0.rst

+2 -1
Original file line number | Diff line number | Diff line change
@@ -103,8 +103,9 @@ Conversion
103103
Strings
104104
^^^^^^^
105105
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
106+
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
106107
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
107-
108+
-
108109

109110
Interval
110111
^^^^^^^^

pandas/core/arrays/_arrow_string_mixins.py

+23 -9
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
from pandas.compat import (
1313
pa_version_under10p1,
14+
pa_version_under11p0,
1415
pa_version_under13p0,
1516
pa_version_under17p0,
1617
)
@@ -22,10 +23,7 @@
2223
import pyarrow.compute as pc
2324

2425
if TYPE_CHECKING:
25-
from collections.abc import (
26-
Callable,
27-
Sized,
28-
)
26+
from collections.abc import Callable
2927

3028
from pandas._typing import (
3129
Scalar,
@@ -34,7 +32,7 @@
3432

3533

3634
class ArrowStringArrayMixin:
37-
_pa_array: Sized
35+
_pa_array: pa.ChunkedArray
3836

3937
def __init__(self, *args, **kwargs) -> None:
4038
raise NotImplementedError
@@ -96,13 +94,29 @@ def _str_get(self, i: int) -> Self:
9694
selected = pc.utf8_slice_codeunits(
9795
self._pa_array, start=start, stop=stop, step=step
9896
)
99-
null_value = pa.scalar(
100-
None,
101-
type=self._pa_array.type, # type: ignore[attr-defined]
102-
)
97+
null_value = pa.scalar(None, type=self._pa_array.type)
10398
result = pc.if_else(not_out_of_bounds, selected, null_value)
10499
return type(self)(result)
105100

101+
def _str_slice(
102+
self, start: int | None = None, stop: int | None = None, step: int | None = None
103+
) -> Self:
104+
if pa_version_under11p0:
105+
# GH#59724
106+
result = self._apply_elementwise(lambda val: val[start:stop:step])
107+
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
108+
if start is None:
109+
if step is not None and step < 0:
110+
# GH#59710
111+
start = -1
112+
else:
113+
start = 0
114+
if step is None:
115+
step = 1
116+
return type(self)(
117+
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
118+
)
119+
106120
def _str_slice_replace(
107121
self, start: int | None = None, stop: int | None = None, repl: str | None = None
108122
) -> Self:

pandas/core/arrays/arrow/array.py

-11
Original file line number | Diff line number | Diff line change
@@ -2394,17 +2394,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self:
23942394
result = self._apply_elementwise(predicate)
23952395
return type(self)(pa.chunked_array(result))
23962396

2397-
def _str_slice(
2398-
self, start: int | None = None, stop: int | None = None, step: int | None = None
2399-
) -> Self:
2400-
if start is None:
2401-
start = 0
2402-
if step is None:
2403-
step = 1
2404-
return type(self)(
2405-
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
2406-
)
2407-
24082397
def _str_len(self) -> Self:
24092398
return type(self)(pc.utf8_length(self._pa_array))
24102399

pandas/core/arrays/string_arrow.py

+1 -13
Original file line number | Diff line number | Diff line change
@@ -294,6 +294,7 @@ def astype(self, dtype, copy: bool = True):
294294
_str_startswith = ArrowStringArrayMixin._str_startswith
295295
_str_endswith = ArrowStringArrayMixin._str_endswith
296296
_str_pad = ArrowStringArrayMixin._str_pad
297+
_str_slice = ArrowStringArrayMixin._str_slice
297298

298299
def _str_contains(
299300
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
@@ -352,19 +353,6 @@ def _str_fullmatch(
352353
pat = f"{pat}$"
353354
return self._str_match(pat, case, flags, na)
354355

355-
def _str_slice(
356-
self, start: int | None = None, stop: int | None = None, step: int | None = None
357-
) -> Self:
358-
if stop is None:
359-
return super()._str_slice(start, stop, step)
360-
if start is None:
361-
start = 0
362-
if step is None:
363-
step = 1
364-
return type(self)(
365-
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
366-
)
367-
368356
def _str_len(self):
369357
result = pc.utf8_length(self._pa_array)
370358
return self._convert_int_result(result)

pandas/tests/extension/test_arrow.py

+1
Original file line number | Diff line number | Diff line change
@@ -2036,6 +2036,7 @@ def test_str_join_string_type():
20362036
[None, 2, None, ["ab", None]],
20372037
[None, 2, 1, ["ab", None]],
20382038
[1, 3, 1, ["bc", None]],
2039+
(None, None, -1, ["dcba", None]),
20392040
],
20402041
)
20412042
def test_str_slice(start, stop, step, exp):

pandas/tests/strings/test_strings.py

+1
Original file line number | Diff line number | Diff line change
@@ -394,6 +394,7 @@ def test_pipe_failures(any_string_dtype):
394394
(2, 5, None, ["foo", "bar", np.nan, "baz"]),
395395
(0, 3, -1, ["", "", np.nan, ""]),
396396
(None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
397+
(None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
397398
(3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
398399
(3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
399400
],

0 commit comments

Comments
 (0)