Skip to content

Commit 14dc708

Browse files
authored
Backport PR #56332: BUG: str.split for ArrowDtype with pat=None (#56348)
* Backport PR #56332: BUG: str.split for ArrowDtype with pat=None
* remove mode note
1 parent 3ee9b3e commit 14dc708

File tree

3 files changed

+26
-8
lines changed

3 files changed

+26
-8
lines changed

doc/source/whatsnew/v2.1.4.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,11 @@ Bug fixes
2525
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`)
2626
- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
2727
- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
28-
- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`)
28+
- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
2929
- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
3030
- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
3131
- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
32+
- Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
3233
- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
3334
-
3435

pandas/core/arrays/arrow/array.py

+15-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import functools
34
import operator
45
import re
56
import textwrap
@@ -2320,18 +2321,25 @@ def _str_split(
23202321
):
23212322
if n in {-1, 0}:
23222323
n = None
2323-
if regex:
2324-
split_func = pc.split_pattern_regex
2324+
if pat is None:
2325+
split_func = pc.utf8_split_whitespace
2326+
elif regex:
2327+
split_func = functools.partial(pc.split_pattern_regex, pattern=pat)
23252328
else:
2326-
split_func = pc.split_pattern
2327-
return type(self)(split_func(self._pa_array, pat, max_splits=n))
2329+
split_func = functools.partial(pc.split_pattern, pattern=pat)
2330+
return type(self)(split_func(self._pa_array, max_splits=n))
23282331

23292332
def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
23302333
if n in {-1, 0}:
23312334
n = None
2332-
return type(self)(
2333-
pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True)
2334-
)
2335+
if pat is None:
2336+
return type(self)(
2337+
pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True)
2338+
)
2339+
else:
2340+
return type(self)(
2341+
pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True)
2342+
)
23352343

23362344
def _str_translate(self, table: dict[int, str]):
23372345
predicate = lambda val: val.translate(table)

pandas/tests/extension/test_arrow.py

+9
Original file line numberDiff line numberDiff line change
@@ -2202,6 +2202,15 @@ def test_str_partition():
22022202
tm.assert_series_equal(result, expected)
22032203

22042204

2205+
@pytest.mark.parametrize("method", ["rsplit", "split"])
2206+
def test_str_split_pat_none(method):
2207+
# GH 56271
2208+
ser = pd.Series(["a1 cbc\nb", None], dtype=ArrowDtype(pa.string()))
2209+
result = getattr(ser.str, method)()
2210+
expected = pd.Series(ArrowExtensionArray(pa.array([["a1", "cbc", "b"], None])))
2211+
tm.assert_series_equal(result, expected)
2212+
2213+
22052214
def test_str_split():
22062215
# GH 52401
22072216
ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))

0 commit comments

Comments
 (0)