diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a1db7742916de..964e81d2fa2ef 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2184,6 +2184,7 @@ def _wrap_result( expand=None, fill_value=np.nan, returns_string=True, + pad_sequences=True, ): from pandas import Index, Series, MultiIndex @@ -2217,22 +2218,19 @@ def _wrap_result( expand = result.ndim != 1 elif expand is True and not isinstance(self._orig, ABCIndexClass): - # required when expand=True is explicitly specified - # not needed when inferred - - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] + if pad_sequences: + # required when expand=True is explicitly specified + # not needed when inferred + result = [x if is_list_like(x) else [x] for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x + for x in result + ] + else: + result = result.tolist() if not isinstance(expand, bool): raise ValueError("expand must be True or False") @@ -2569,6 +2567,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``True``, return DataFrame/MultiIndex expanding dimensionality. * If ``False``, return Series/Index, containing lists of strings. + pad_sequences : bool, default True + When expand is ``True``, pad the ending of resulting sequences with + ``nan`` to the length of the longest sequence, ensuring each row in the + resulting DataFrame has the same number of columns. + If ``False``, you must be sure beforehand that the split strings will have + uniform lengths. + Has no effect when expand is ``False``. 
Returns ------- @@ -2681,19 +2686,38 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): >>> s.str.split(r"\+|=", expand=True) 0 1 2 0 1 1 2 + + When using ``expand=True``, if you have already verified that the + particular split will result in sequences of uniform length, you may opt + out of the (sometimes expensive) length normalization process with + ``pad_sequences=False``. + + >>> s = pd.Series(["foo bar", "baz qaz"]) + >>> s + 0 foo bar + 1 baz qaz + dtype: object + >>> s.str.split(expand=True, pad_sequences=False) + 0 1 + 0 foo bar + 1 baz qaz """ @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): + def split(self, pat=None, n=-1, expand=False, pad_sequences=True): result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + return self._wrap_result( + result, expand=expand, returns_string=expand, pad_sequences=pad_sequences + ) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): + def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True): result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + return self._wrap_result( + result, expand=expand, returns_string=expand, pad_sequences=pad_sequences + ) _shared_docs[ "str_partition"