Skip to content

Commit 669acb4

Browse files
authored
ENH: added regex argument to Series.str.split (#44185)
1 parent 4d507b0 commit 669acb4

File tree

4 files changed

+139
-22
lines changed

4 files changed

+139
-22
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ Other enhancements
180180
- :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`)
181181
- The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`)
182182
- Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
183+
- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
183184
- :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
184185
-
185186

pandas/core/strings/accessor.py

+75-15
Original file line numberDiff line numberDiff line change
@@ -659,11 +659,11 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
659659
Split strings around given separator/delimiter.
660660
661661
Splits the string in the Series/Index from the %(side)s,
662-
at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
662+
at the specified delimiter string.
663663
664664
Parameters
665665
----------
666-
pat : str, optional
666+
pat : str or compiled regex, optional
667667
String or regular expression to split on.
668668
If not specified, split on whitespace.
669669
n : int, default -1 (all)
@@ -672,14 +672,30 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
672672
expand : bool, default False
673673
Expand the split strings into separate columns.
674674
675-
* If ``True``, return DataFrame/MultiIndex expanding dimensionality.
676-
* If ``False``, return Series/Index, containing lists of strings.
675+
- If ``True``, return DataFrame/MultiIndex expanding dimensionality.
676+
- If ``False``, return Series/Index, containing lists of strings.
677+
678+
regex : bool, default None
679+
Determines if the passed-in pattern is a regular expression:
680+
681+
- If ``True``, assumes the passed-in pattern is a regular expression.
682+
- If ``False``, treats the pattern as a literal string.
683+
- If ``None`` and `pat` length is 1, treats `pat` as a literal string.
684+
- If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
685+
- Cannot be set to ``False`` if `pat` is a compiled regex.
686+
687+
.. versionadded:: 1.4.0
677688
678689
Returns
679690
-------
680691
Series, Index, DataFrame or MultiIndex
681692
Type matches caller unless ``expand=True`` (see Notes).
682693
694+
Raises
695+
------
696+
ValueError
697+
* If `regex` is ``False`` and `pat` is a compiled regex.
698+
683699
See Also
684700
--------
685701
Series.str.split : Split strings around given separator/delimiter.
@@ -702,6 +718,9 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
702718
If using ``expand=True``, Series and Index callers return DataFrame and
703719
MultiIndex objects, respectively.
704720
721+
Passing a compiled regex as `pat` together with ``regex=False`` raises
722+
a ``ValueError``.
723+
705724
Examples
706725
--------
707726
>>> s = pd.Series(
@@ -776,22 +795,63 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
776795
1 https://docs.python.org/3/tutorial index.html
777796
2 NaN NaN
778797
779-
Remember to escape special characters when explicitly using regular
780-
expressions.
798+
Remember to escape special characters when explicitly using regular expressions.
781799
782-
>>> s = pd.Series(["1+1=2"])
783-
>>> s
784-
0 1+1=2
785-
dtype: object
786-
>>> s.str.split(r"\+|=", expand=True)
787-
0 1 2
788-
0 1 1 2
800+
>>> s = pd.Series(["foo and bar plus baz"])
801+
>>> s.str.split(r"and|plus", expand=True)
802+
0 1 2
803+
0 foo bar baz
804+
805+
Regular expressions can be used to handle URLs or file names.
806+
When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
807+
as a regex only if ``len(pat) != 1``.
808+
809+
>>> s = pd.Series(['foojpgbar.jpg'])
810+
>>> s.str.split(r".", expand=True)
811+
0 1
812+
0 foojpgbar jpg
813+
814+
>>> s.str.split(r"\.jpg", expand=True)
815+
0 1
816+
0 foojpgbar
817+
818+
When ``regex=True``, `pat` is interpreted as a regex.
819+
820+
>>> s.str.split(r"\.jpg", regex=True, expand=True)
821+
0 1
822+
0 foojpgbar
823+
824+
A compiled regex can be passed as `pat`.
825+
826+
>>> import re
827+
>>> s.str.split(re.compile(r"\.jpg"), expand=True)
828+
0 1
829+
0 foojpgbar
830+
831+
When ``regex=False``, `pat` is interpreted as the string itself.
832+
833+
>>> s.str.split(r"\.jpg", regex=False, expand=True)
834+
0
835+
0 foojpgbar.jpg
789836
"""
790837

791838
@Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
792839
@forbid_nonstring_types(["bytes"])
793-
def split(self, pat=None, n=-1, expand=False):
794-
result = self._data.array._str_split(pat, n, expand)
840+
def split(
841+
self,
842+
pat: str | re.Pattern | None = None,
843+
n=-1,
844+
expand=False,
845+
*,
846+
regex: bool | None = None,
847+
):
848+
if regex is False and is_re(pat):
849+
raise ValueError(
850+
"Cannot use a compiled regex as replacement pattern with regex=False"
851+
)
852+
if is_re(pat):
853+
regex = True
854+
result = self._data.array._str_split(pat, n, expand, regex)
795855
return self._wrap_result(result, returns_string=expand, expand=expand)
796856

797857
@Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})

pandas/core/strings/object_array.py

+24-7
Original file line numberDiff line numberDiff line change
@@ -308,21 +308,38 @@ def f(x):
308308

309309
return self._str_map(f)
310310

311-
def _str_split(self, pat=None, n=-1, expand=False):
311+
def _str_split(
312+
self,
313+
pat: str | re.Pattern | None = None,
314+
n=-1,
315+
expand=False,
316+
regex: bool | None = None,
317+
):
312318
if pat is None:
313319
if n is None or n == 0:
314320
n = -1
315321
f = lambda x: x.split(pat, n)
316322
else:
317-
if len(pat) == 1:
318-
if n is None or n == 0:
319-
n = -1
320-
f = lambda x: x.split(pat, n)
323+
new_pat: str | re.Pattern
324+
if regex is True or isinstance(pat, re.Pattern):
325+
new_pat = re.compile(pat)
326+
elif regex is False:
327+
new_pat = pat
328+
# regex is None, so fall back to the pre-existing length-based behavior (GH#43563)
321329
else:
330+
if len(pat) == 1:
331+
new_pat = pat
332+
else:
333+
new_pat = re.compile(pat)
334+
335+
if isinstance(new_pat, re.Pattern):
322336
if n is None or n == -1:
323337
n = 0
324-
regex = re.compile(pat)
325-
f = lambda x: regex.split(x, maxsplit=n)
338+
f = lambda x: new_pat.split(x, maxsplit=n)
339+
else:
340+
if n is None or n == 0:
341+
n = -1
342+
f = lambda x: x.split(pat, n)
326343
return self._str_map(f, dtype=object)
327344

328345
def _str_rsplit(self, pat=None, n=-1):

pandas/tests/strings/test_split_partition.py

+39
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datetime import datetime
2+
import re
23

34
import numpy as np
45
import pytest
@@ -35,6 +36,44 @@ def test_split(any_string_dtype):
3536
tm.assert_series_equal(result, exp)
3637

3738

39+
def test_split_regex(any_string_dtype):
40+
# GH 43563
41+
# explicit regex = True split
42+
values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
43+
result = values.str.split(r"\.jpg", regex=True)
44+
exp = Series([["xxxjpgzzz", ""]])
45+
tm.assert_series_equal(result, exp)
46+
47+
# compiled regex passed as pat without an explicit regex argument
48+
regex_pat = re.compile(r".jpg")
49+
values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
50+
result = values.str.split(regex_pat)
51+
exp = Series([["xx", "zzz", ""]])
52+
tm.assert_series_equal(result, exp)
53+
54+
# explicit regex = False split
55+
result = values.str.split(r"\.jpg", regex=False)
56+
exp = Series([["xxxjpgzzz.jpg"]])
57+
tm.assert_series_equal(result, exp)
58+
59+
# non explicit regex split, pattern length == 1
60+
result = values.str.split(r".")
61+
exp = Series([["xxxjpgzzz", "jpg"]])
62+
tm.assert_series_equal(result, exp)
63+
64+
# non explicit regex split, pattern length != 1
65+
result = values.str.split(r".jpg")
66+
exp = Series([["xx", "zzz", ""]])
67+
tm.assert_series_equal(result, exp)
68+
69+
# regex=False with pattern compiled regex raises error
70+
with pytest.raises(
71+
ValueError,
72+
match="Cannot use a compiled regex as replacement pattern with regex=False",
73+
):
74+
values.str.split(regex_pat, regex=False)
75+
76+
3877
def test_split_object_mixed():
3978
mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
4079
result = mixed.str.split("_")

0 commit comments

Comments
 (0)