pandas-dev · jreback · Nov 4, 2021 · Oct 15, 2021 · Oct 15, 2021 · Oct 15, 2021
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -179,6 +179,7 @@ Other enhancements
 - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`)
 - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`)
 - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
+- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 -
 

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -657,7 +657,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
 
     Parameters
     ----------
-    pat : str, optional
+    pat : str, or compiled regex optional
-    pat : str, or compiled regex optional
+    pat : str or compiled regex, optional
-    pat : str, or compiled regex optional
+    pat : str or compiled regex, optional
         String or regular expression to split on.
         If not specified, split on whitespace.
     n : int, default -1 (all)
@@ -669,6 +669,18 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
         * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
         * If ``False``, return Series/Index, containing lists of strings.
 
+    regex : bool, default None
+        Determines whether to handle the pattern as a regular expression.
+        If ``pat`` is a compiled regular expression, it is interpreted as a
+        regular expression regardless of ``regex``.
+
+        * If ``True``, assumes the passed-in pattern is a regular expression.
+        * If ``False``, treats the pattern as a literal string.
+        * If ``None`` and the pattern length is 1, treats the pattern as a
+          literal string.
+        * If ``None`` and the pattern length is not 1, treats the pattern as
+          a regular expression.
+
     Returns
     -------
     Series, Index, DataFrame or MultiIndex
@@ -770,22 +782,44 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     1  https://docs.python.org/3/tutorial  index.html
     2                                 NaN         NaN
 
-    Remember to escape special characters when explicitly using regular
-    expressions.
+    Remember to escape special characters when explicitly using regular expressions.
+    When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
+    as a regex only if ``len(pat) != 1``.
 
-    >>> s = pd.Series(["1+1=2"])
-    >>> s
-    0    1+1=2
-    dtype: object
-    >>> s.str.split(r"\+|=", expand=True)
-         0    1    2
-    0    1    1    2
+    >>> s = pd.Series(['fooojpgbar.jpg'])
+    >>> s.str.split(r".", expand=True)
+                0    1
+    0  fooojpgbar  jpg
+    >>> s.str.split(r"\.jpg", expand=True)
+                0 1
+    0  fooojpgbar
+    >>> s.str.split(r".jpg", expand=True)
+         0    1 2
+    0  foo  bar
+
+    When ``regex=True``, `pat` is interpreted as a regex.
+
+    >>> s.str.split(r"\.jpg", regex=True, expand=True)
+                0 1
+    0  fooojpgbar
+
+    When ``regex=False``, `pat` is interpreted as the string itself.
+
+    >>> s.str.split(r"\.jpg", regex=False, expand=True)
+                    0
+    0  fooojpgbar.jpg
     """
 
     @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
     @forbid_nonstring_types(["bytes"])
-    def split(self, pat=None, n=-1, expand=False):
-        result = self._data.array._str_split(pat, n, expand)
+    def split(
+        self,
+        pat: str | re.Pattern | None = None,
+        n=-1,
+        expand=False,
+        regex: bool | None = None,
+    ):
+        result = self._data.array._str_split(pat, n, expand, regex)
         return self._wrap_result(result, returns_string=expand, expand=expand)
 
     @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})

diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -308,21 +308,38 @@ def f(x):
 
         return self._str_map(f)
 
-    def _str_split(self, pat=None, n=-1, expand=False):
+    def _str_split(
+        self,
+        pat: str | re.Pattern | None = None,
+        n=-1,
+        expand=False,
+        regex: bool | None = None,
+    ):
 if case is False: 
     # add case flag, if provided 
     flags |= re.IGNORECASE 
 if regex or flags or callable(repl): 
     if not isinstance(pat, re.Pattern): 
         if regex is False: 
             pat = re.escape(pat) 
         pat = re.compile(pat, flags=flags) 
 if len(pat) == 1: 
     if n is None or n == 0: 
         n = -1 
     f = lambda x: x.split(pat, n) 
 else: 
     if n is None or n == -1: 
         n = 0 
     regex = re.compile(pat) 
     f = lambda x: regex.split(x, maxsplit=n) 
 if case is False: 
     # add case flag, if provided 
     flags |= re.IGNORECASE 
  
 if regex or flags or callable(repl): 
     if not isinstance(pat, re.Pattern): 
         if regex is False: 
             pat = re.escape(pat) 
         pat = re.compile(pat, flags=flags) 
 if len(pat) == 1: 
     if n is None or n == 0: 
         n = -1 
     f = lambda x: x.split(pat, n) 
 else: 
     if n is None or n == -1: 
         n = 0 
     regex = re.compile(pat) 
     f = lambda x: regex.split(x, maxsplit=n) 
         if pat is None:
             if n is None or n == 0:
                 n = -1
             f = lambda x: x.split(pat, n)
         else:
-            if len(pat) == 1:
-                if n is None or n == 0:
-                    n = -1
-                f = lambda x: x.split(pat, n)
+            new_pat: str | re.Pattern
+            if regex is True or isinstance(pat, re.Pattern):
+                new_pat = re.compile(pat)
+            elif regex is False:
+                new_pat = pat
+            # regex is None, so fall back to the pre-existing behavior (GH#43563)
             else:
+                if len(pat) == 1:
+                    new_pat = pat
+                else:
+                    new_pat = re.compile(pat)
+
+            if isinstance(new_pat, re.Pattern):
                 if n is None or n == -1:
                     n = 0
-                regex = re.compile(pat)
-                f = lambda x: regex.split(x, maxsplit=n)
+                f = lambda x: new_pat.split(x, maxsplit=n)
+            else:
+                if n is None or n == 0:
+                    n = -1
+                f = lambda x: x.split(pat, n)
         return self._str_map(f, dtype=object)
 
     def _str_rsplit(self, pat=None, n=-1):

diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
@@ -34,6 +34,27 @@ def test_split(any_string_dtype):
     exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
     tm.assert_series_equal(result, exp)
 
+    # explicit regex = True split
+    values = Series("qweqwejpgqweqwe.jpg", dtype=any_string_dtype)
+    result = values.str.split(r"\.jpg", regex=True)
+    exp = Series([["qweqwejpgqweqwe", ""]])
+    tm.assert_series_equal(result, exp)
+
+    # explicit regex = False split
+    result = values.str.split(r"\.jpg", regex=False)
+    exp = Series([["qweqwejpgqweqwe.jpg"]])
+    tm.assert_series_equal(result, exp)
+
+    # non explicit regex split, pattern length == 1
+    result = values.str.split(r".")
+    exp = Series([["qweqwejpgqweqwe", "jpg"]])
+    tm.assert_series_equal(result, exp)
+
+    # non explicit regex split, pattern length != 1
+    result = values.str.split(r".jpg")
+    exp = Series([["qweqw", "qweqwe", ""]])
+    tm.assert_series_equal(result, exp)
+
 
 def test_split_object_mixed():
     mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])