diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index d9d2330f8f11b..84b26303e0459 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -526,6 +526,7 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`Series.str.split` would not treat ``pat`` as regex when ``regex=None`` for series having ``pd.ArrowDtype(pa.string())`` dtype (:issue:`58321`)
 - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
 -
 
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index dd9276179cf4d..fbe51dcc4a919 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -920,6 +920,9 @@ def split(
             )
         if is_re(pat):
             regex = True
+        elif isinstance(pat, str) and regex is None:
+            # regex=None: only a length-1 pat is literal, else regex (GH#43563)
+            regex = len(pat) != 1
         result = self._data.array._str_split(pat, n, expand, regex)
         if self._data.dtype == "category":
             dtype = self._data.dtype.categories.dtype
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index f2e9d2321f33e..562d33edbeab7 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -2296,6 +2296,27 @@ def test_str_split_pat_none(method):
     tm.assert_series_equal(result, expected)
 
 
+def test_str_split_regex_explicit():
+    # GH 58321
+    # adapted from tests/strings/test_split_partition.py
+    values = pd.Series("xxxjpgzzz.jpg", dtype=ArrowDtype(pa.string()))
+
+    # regex=False: r"\.jpg" is matched literally, so nothing is split
+    result = values.str.split(r"\.jpg", regex=False)
+    exp = pd.Series(ArrowExtensionArray(pa.array([["xxxjpgzzz.jpg"]])))
+    tm.assert_series_equal(result, exp)
+
+    # regex not given, len(pat) == 1: "." is a literal dot, not a wildcard
+    result = values.str.split(r".")
+    exp = pd.Series(ArrowExtensionArray(pa.array([["xxxjpgzzz", "jpg"]])))
+    tm.assert_series_equal(result, exp)
+
+    # regex not given, len(pat) != 1: ".jpg" is treated as a regex
+    result = values.str.split(r".jpg")
+    exp = pd.Series(ArrowExtensionArray(pa.array([["xx", "zzz", ""]])))
+    tm.assert_series_equal(result, exp)
+
+
 def test_str_split():
     # GH 52401
     ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))