Skip to content

Commit 2718b4e

Browse files
authored
BUG: str.split for ArrowDtype with pat=None (#56332)
1 parent c99981a commit 2718b4e

File tree

3 files changed

+26
-8
lines changed

3 files changed

+26
-8
lines changed

doc/source/whatsnew/v2.1.4.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,12 @@ Bug fixes
2424
- Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`)
2525
- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
2626
- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
27-
- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`)
27+
- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
2828
- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
2929
- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
3030
- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
3131
- Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
32+
- Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
3233
- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
3334

3435
.. ---------------------------------------------------------------------------

pandas/core/arrays/arrow/array.py

+15-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import functools
34
import operator
45
import re
56
import textwrap
@@ -2351,18 +2352,25 @@ def _str_split(
23512352
):
23522353
if n in {-1, 0}:
23532354
n = None
2354-
if regex:
2355-
split_func = pc.split_pattern_regex
2355+
if pat is None:
2356+
split_func = pc.utf8_split_whitespace
2357+
elif regex:
2358+
split_func = functools.partial(pc.split_pattern_regex, pattern=pat)
23562359
else:
2357-
split_func = pc.split_pattern
2358-
return type(self)(split_func(self._pa_array, pat, max_splits=n))
2360+
split_func = functools.partial(pc.split_pattern, pattern=pat)
2361+
return type(self)(split_func(self._pa_array, max_splits=n))
23592362

23602363
def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
23612364
if n in {-1, 0}:
23622365
n = None
2363-
return type(self)(
2364-
pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True)
2365-
)
2366+
if pat is None:
2367+
return type(self)(
2368+
pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True)
2369+
)
2370+
else:
2371+
return type(self)(
2372+
pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True)
2373+
)
23662374

23672375
def _str_translate(self, table: dict[int, str]):
23682376
predicate = lambda val: val.translate(table)

pandas/tests/extension/test_arrow.py

+9
Original file line numberDiff line numberDiff line change
@@ -2076,6 +2076,15 @@ def test_str_partition():
20762076
tm.assert_series_equal(result, expected)
20772077

20782078

2079+
@pytest.mark.parametrize("method", ["rsplit", "split"])
2080+
def test_str_split_pat_none(method):
2081+
# GH 56271
2082+
ser = pd.Series(["a1 cbc\nb", None], dtype=ArrowDtype(pa.string()))
2083+
result = getattr(ser.str, method)()
2084+
expected = pd.Series(ArrowExtensionArray(pa.array([["a1", "cbc", "b"], None])))
2085+
tm.assert_series_equal(result, expected)
2086+
2087+
20792088
def test_str_split():
20802089
# GH 52401
20812090
ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))

0 commit comments

Comments
 (0)