PERF: Allow str.split callers to skip expensive post-processing #35223

Closed
wants to merge 6 commits into from
64 changes: 44 additions & 20 deletions pandas/core/strings.py
@@ -2184,6 +2184,7 @@ def _wrap_result(
expand=None,
fill_value=np.nan,
returns_string=True,
pad_sequences=True,
):

from pandas import Index, Series, MultiIndex
@@ -2217,22 +2218,19 @@ def _wrap_result(
expand = result.ndim != 1

elif expand is True and not isinstance(self._orig, ABCIndexClass):
# required when expand=True is explicitly specified
# not needed when inferred

def cons_row(x):
Contributor

you could just write a short cython routine which does all of this
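
For context, "all of this" refers to the scalar-wrapping and NaN-padding pass shown in the surrounding diff. A minimal pure-Python sketch of what such a consolidated helper would do (the name `_pad_rows` is hypothetical and not part of this PR) might look like:

```python
import numpy as np
from pandas.api.types import is_list_like

def _pad_rows(result):
    # Wrap bare scalars so every row is a sequence.
    rows = [x if is_list_like(x) else [x] for x in result]
    if rows:
        # Repeat rows that are empty or start with NaN max_len times, so a
        # lone NaN expands to fill every column (GH 18450); this mirrors the
        # list comprehension in the diff below.
        max_len = max(len(x) for x in rows)
        rows = [x * max_len if len(x) == 0 or x[0] is np.nan else x for x in rows]
    return rows
```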

if is_list_like(x):
return x
else:
return [x]

result = [cons_row(x) for x in result]
if result:
# propagate nan values to match longest sequence (GH 18450)
max_len = max(len(x) for x in result)
result = [
x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
]
if pad_sequences:
# required when expand=True is explicitly specified
# not needed when inferred
result = [x if is_list_like(x) else [x] for x in result]
if result:
# propagate nan values to match longest sequence (GH 18450)
max_len = max(len(x) for x in result)
result = [
x * max_len if len(x) == 0 or x[0] is np.nan else x
for x in result
]
else:
result = result.tolist()

if not isinstance(expand, bool):
raise ValueError("expand must be True or False")
@@ -2569,6 +2567,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):

* If ``True``, return DataFrame/MultiIndex expanding dimensionality.
* If ``False``, return Series/Index, containing lists of strings.
pad_sequences : bool, default True
When expand is ``True``, pad the end of each resulting sequence with
``nan`` to the length of the longest sequence, ensuring each row in the
resulting DataFrame has the same number of columns.
If ``False``, you must ensure beforehand that the split strings will have
uniform lengths.
Has no effect when expand is ``False``.

Returns
-------
@@ -2681,19 +2686,38 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
>>> s.str.split(r"\+|=", expand=True)
0 1 2
0 1 1 2

When using ``expand=True``, if you have already verified that the
particular split will result in sequences of uniform length, you may opt
out of the (sometimes expensive) length normalization process with
``pad_sequences=False``.

>>> s = pd.Series(["foo bar", "baz qaz"])
>>> s
0 foo bar
1 baz qaz
dtype: object
>>> s.str.split(expand=True, pad_sequences=False)
0 1
0 foo bar
1 baz qaz
"""

@Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
@forbid_nonstring_types(["bytes"])
def split(self, pat=None, n=-1, expand=False):
def split(self, pat=None, n=-1, expand=False, pad_sequences=True):
result = str_split(self._parent, pat, n=n)
return self._wrap_result(result, expand=expand, returns_string=expand)
return self._wrap_result(
result, expand=expand, returns_string=expand, pad_sequences=pad_sequences
)

@Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
@forbid_nonstring_types(["bytes"])
def rsplit(self, pat=None, n=-1, expand=False):
def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True):
result = str_rsplit(self._parent, pat, n=n)
return self._wrap_result(result, expand=expand, returns_string=expand)
return self._wrap_result(
result, expand=expand, returns_string=expand, pad_sequences=pad_sequences
)

_shared_docs[
"str_partition"
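
As a closing illustration of the intended call pattern, a rough comparison sketch (illustrative only: it assumes a pandas build containing this branch, and the timings are machine-dependent) for a caller who already knows the split is uniform:

```python
import timeit

import pandas as pd

# Every row splits into exactly two tokens, so the padding pass is redundant.
s = pd.Series(["foo bar"] * 1_000_000)

default = timeit.timeit(lambda: s.str.split(expand=True), number=1)
skipped = timeit.timeit(
    lambda: s.str.split(expand=True, pad_sequences=False), number=1
)
print(f"default: {default:.2f}s  pad_sequences=False: {skipped:.2f}s")
```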