pandas-dev · yuanx749 · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024 · Apr 30, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -397,6 +397,7 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`Series.str.split` would not treat a multi-character ``pat`` as a regex when ``regex=None`` for series having ``pd.ArrowDtype(pa.string())`` dtype (:issue:`58321`)
 - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
 -
 

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -92,7 +92,6 @@
 except ImportError:
     has_pyarrow = False
 else:
-    del pa
     has_pyarrow = True
 
 import zoneinfo
@@ -1367,6 +1366,25 @@ def any_string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        "object",
+        "string[python]",
+        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
+        # Placeholder only: building the ArrowDtype here would evaluate
+        # ``pa.string()`` at import time, before the skip mark can take effect.
+        pytest.param("ArrowDtype(pa.string())", marks=td.skip_if_no("pyarrow")),
+    ]
+)
+def any_string_dtype_2(request):
+    """
+    Parametrized fixture for string dtypes, including the pyarrow-backed ones.
+
+    * 'object'
+    * 'string[python]'
+    * 'string[pyarrow]'
+    * 'string[pyarrow_numpy]'
+    * pd.ArrowDtype(pa.string())
+    """
+    if request.param == "ArrowDtype(pa.string())":
+        # Resolved lazily so that importing conftest never touches ``pa`` when
+        # pyarrow is not installed (the name is unbound in that case).
+        pa = pytest.importorskip("pyarrow")
+        return pd.ArrowDtype(pa.string())
+    return request.param
+
+
 @pytest.fixture(params=tm.DATETIME64_DTYPES)
 def datetime64_dtype(request):
     """

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -910,6 +910,9 @@ def split(
             )
         if is_re(pat):
             regex = True
+        elif pat is not None and regex is None:
+            # regex is None so link to old behavior #43563
+            regex = len(pat) != 1
         result = self._data.array._str_split(pat, n, expand, regex)
         if self._data.dtype == "category":
             dtype = self._data.dtype.categories.dtype

diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -339,14 +339,8 @@ def _str_split(
             new_pat: str | re.Pattern
             if regex is True or isinstance(pat, re.Pattern):
                 new_pat = re.compile(pat)
-            elif regex is False:
-                new_pat = pat
-            # regex is None so link to old behavior #43563
             else:
-                if len(pat) == 1:
-                    new_pat = pat
-                else:
-                    new_pat = re.compile(pat)
+                new_pat = pat
 
             if isinstance(new_pat, re.Pattern):
                 if n is None or n == -1:

diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
@@ -17,6 +17,10 @@
     object_pyarrow_numpy,
 )
 
+pa = pytest.importorskip("pyarrow")
+
+from pandas.core.arrays.arrow.array import ArrowExtensionArray
+
 
 @pytest.mark.parametrize("method", ["split", "rsplit"])
 def test_split(any_string_dtype, method):
@@ -59,27 +63,39 @@ def test_split_regex(any_string_dtype):
     tm.assert_series_equal(result, exp)
 
 
-def test_split_regex_explicit(any_string_dtype):
+def test_split_regex_explicit(any_string_dtype_2):
     # explicit regex = True split with compiled regex
     regex_pat = re.compile(r".jpg")
-    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
-    result = values.str.split(regex_pat)
-    exp = Series([["xx", "zzz", ""]])
-    tm.assert_series_equal(result, exp)
+    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype_2)
+
+    if not isinstance(any_string_dtype_2, pd.ArrowDtype):
+        # ArrowDtype does not support compiled regex
+        result = values.str.split(regex_pat)
+        exp = Series([["xx", "zzz", ""]])
+        tm.assert_series_equal(result, exp)
 
     # explicit regex = False split
     result = values.str.split(r"\.jpg", regex=False)
-    exp = Series([["xxxjpgzzz.jpg"]])
+    if not isinstance(any_string_dtype_2, pd.ArrowDtype):
+        exp = Series([["xxxjpgzzz.jpg"]])
+    else:
+        exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz.jpg"]])))
     tm.assert_series_equal(result, exp)
 
     # non explicit regex split, pattern length == 1
     result = values.str.split(r".")
-    exp = Series([["xxxjpgzzz", "jpg"]])
+    if not isinstance(any_string_dtype_2, pd.ArrowDtype):
+        exp = Series([["xxxjpgzzz", "jpg"]])
+    else:
+        exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz", "jpg"]])))
     tm.assert_series_equal(result, exp)
 
     # non explicit regex split, pattern length != 1
     result = values.str.split(r".jpg")
-    exp = Series([["xx", "zzz", ""]])
+    if not isinstance(any_string_dtype_2, pd.ArrowDtype):
+        exp = Series([["xx", "zzz", ""]])
+    else:
+        exp = Series(ArrowExtensionArray(pa.array([["xx", "zzz", ""]])))
     tm.assert_series_equal(result, exp)
 
     # regex=False with pattern compiled regex raises error
-Original file line number
+Diff line change
@@ Expand Up / @@ -397,6 +397,7 @@ Conversion @@
     Strings
     ^^^^^^^
+    - Bug in :meth:`Series.str.split` would not treat ``pat`` as regex when ``regex=None`` for series having ``pd.ArrowDtype(pa.string())`` dtype (:issue:`58321`)
     - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
     -
@@ Expand Down @@