From 60cc73575ab590f0f28c575e6e7e1fa0db776633 Mon Sep 17 00:00:00 2001
From: Will Badart <badart_william@bah.com>
Date: Fri, 10 Jul 2020 12:03:39 -0700
Subject: [PATCH 1/6] Inline cons_row

The abstraction isn't used anywhere else, so let's see if eliminating
the function call helps performance at all.
---
 pandas/core/strings.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index a1db7742916de..631373268afa8 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2220,13 +2220,7 @@ def _wrap_result(
             # required when expand=True is explicitly specified
             # not needed when inferred
 
-            def cons_row(x):
-                if is_list_like(x):
-                    return x
-                else:
-                    return [x]
-
-            result = [cons_row(x) for x in result]
+            result = [x if is_list_like(x) else [x] for x in result]
             if result:
                 # propagate nan values to match longest sequence (GH 18450)
                 max_len = max(len(x) for x in result)

From 7b1caaaadcad307764494358c1345bdcb176b892 Mon Sep 17 00:00:00 2001
From: Will Badart <badart_william@bah.com>
Date: Fri, 10 Jul 2020 12:27:50 -0700
Subject: [PATCH 2/6] Allow split caller to skip expand's sequence padding

This can be useful when the user has already ensured that the split will
result in a clean rectangle, such as a complete list of well-formed IPv4
addresses being split on ".". It allows them to skip three Python
iteration passes over the resulting data.
---
 pandas/core/strings.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 631373268afa8..206999fd3548c 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2184,6 +2184,7 @@ def _wrap_result(
         expand=None,
         fill_value=np.nan,
         returns_string=True,
+        pad_sequences=True,
     ):
 
         from pandas import Index, Series, MultiIndex
@@ -2216,7 +2217,7 @@ def _wrap_result(
             # infer from ndim if expand is not specified
             expand = result.ndim != 1
 
-        elif expand is True and not isinstance(self._orig, ABCIndexClass):
+        elif expand is True and not isinstance(self._orig, ABCIndexClass) and pad_sequences:
             # required when expand=True is explicitly specified
             # not needed when inferred
 
@@ -2679,15 +2680,15 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
 
     @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
     @forbid_nonstring_types(["bytes"])
-    def split(self, pat=None, n=-1, expand=False):
+    def split(self, pat=None, n=-1, expand=False, pad_sequences=True):
         result = str_split(self._parent, pat, n=n)
-        return self._wrap_result(result, expand=expand, returns_string=expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences)
 
     @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
     @forbid_nonstring_types(["bytes"])
-    def rsplit(self, pat=None, n=-1, expand=False):
+    def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True):
         result = str_rsplit(self._parent, pat, n=n)
-        return self._wrap_result(result, expand=expand, returns_string=expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences)
 
     _shared_docs[
         "str_partition"
@@ -2706,6 +2707,13 @@ def rsplit(self, pat=None, n=-1, expand=False):
     expand : bool, default True
         If True, return DataFrame/MultiIndex expanding dimensionality.
         If False, return Series/Index.
+    pad_sequences : bool, default True
+        If True and expand is True, pad the resulting sequences with nan to
+        match the length of the longest sequence (so each row has the same
+        number of columns).
+        If False, the result of each split must result in sequences of equal
+        length.
+        Has no effect when expand is False.
 
     Returns
     -------

From 06505c0464a2f2a0fa12a79a44d4648f7ebf7c01 Mon Sep 17 00:00:00 2001
From: Will Badart <badart_william@bah.com>
Date: Fri, 10 Jul 2020 12:38:44 -0700
Subject: [PATCH 3/6] Apply black formatting

---
 pandas/core/strings.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 206999fd3548c..cfccd849b0346 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2217,7 +2217,11 @@ def _wrap_result(
             # infer from ndim if expand is not specified
             expand = result.ndim != 1
 
-        elif expand is True and not isinstance(self._orig, ABCIndexClass) and pad_sequences:
+        elif (
+            expand is True
+            and not isinstance(self._orig, ABCIndexClass)
+            and pad_sequences
+        ):
             # required when expand=True is explicitly specified
             # not needed when inferred
 
@@ -2682,13 +2686,17 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     @forbid_nonstring_types(["bytes"])
     def split(self, pat=None, n=-1, expand=False, pad_sequences=True):
         result = str_split(self._parent, pat, n=n)
-        return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences)
+        return self._wrap_result(
+            result, expand=expand, returns_string=expand, pad_sequences=pad_sequences
+        )
 
     @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
     @forbid_nonstring_types(["bytes"])
     def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True):
         result = str_rsplit(self._parent, pat, n=n)
-        return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences)
+        return self._wrap_result(
+            result, expand=expand, returns_string=expand, pad_sequences=pad_sequences
+        )
 
     _shared_docs[
         "str_partition"

From ae81973de279fb9c1fade03176b6752ad7579263 Mon Sep 17 00:00:00 2001
From: Will Badart <badart_william@bah.com>
Date: Fri, 10 Jul 2020 12:54:57 -0700
Subject: [PATCH 4/6] Corrected placement of pad_sequences documentation

---
 pandas/core/strings.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index cfccd849b0346..b04e3b26500a7 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2568,6 +2568,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
 
         * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
         * If ``False``, return Series/Index, containing lists of strings.
+    pad_sequences : bool, default True
+        When expand is ``True``, pad the ending of resulting sequences with
+        ``nan`` to the length of the longest sequence, ensuring each row in the
+        resulting DataFrame has the same number of columns.
+        If ``False``, you must ensure beforehand that every split produces the
+        same number of elements.
+        Has no effect when expand is ``False``.
 
     Returns
     -------
@@ -2715,13 +2722,6 @@ def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True):
     expand : bool, default True
         If True, return DataFrame/MultiIndex expanding dimensionality.
         If False, return Series/Index.
-    pad_sequences : bool, default True
-        If True and expand is True, pad the resulting sequences with nan to
-        match the length of the longest sequence (so each row has the same
-        number of columns).
-        If False, the result of each split must result in sequences of equal
-        length.
-        Has no effect when expand is False.
 
     Returns
     -------

From cd233bc2d05103606c0720880177cf1dd3837446 Mon Sep 17 00:00:00 2001
From: Will Badart <badart_william@bah.com>
Date: Fri, 10 Jul 2020 16:04:24 -0700
Subject: [PATCH 5/6] Fix pad_sequences logic

We can't skip the whole block because, when it starts, result is a numpy
array of lists, but it must still be converted to a list of lists for
expand to function as expected.
---
 pandas/core/strings.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index b04e3b26500a7..3f5f9885d9397 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2217,21 +2217,20 @@ def _wrap_result(
             # infer from ndim if expand is not specified
             expand = result.ndim != 1
 
-        elif (
-            expand is True
-            and not isinstance(self._orig, ABCIndexClass)
-            and pad_sequences
-        ):
-            # required when expand=True is explicitly specified
-            # not needed when inferred
-
-            result = [x if is_list_like(x) else [x] for x in result]
-            if result:
-                # propagate nan values to match longest sequence (GH 18450)
-                max_len = max(len(x) for x in result)
-                result = [
-                    x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
-                ]
+        elif expand is True and not isinstance(self._orig, ABCIndexClass):
+            if pad_sequences:
+                # required when expand=True is explicitly specified
+                # not needed when inferred
+                result = [x if is_list_like(x) else [x] for x in result]
+                if result:
+                    # propagate nan values to match longest sequence (GH 18450)
+                    max_len = max(len(x) for x in result)
+                    result = [
+                        x * max_len if len(x) == 0 or x[0] is np.nan else x
+                        for x in result
+                    ]
+            else:
+                result = result.tolist()
 
         if not isinstance(expand, bool):
             raise ValueError("expand must be True or False")

From e242615a2927f2335e3a115ab5e9be1082a9cc7b Mon Sep 17 00:00:00 2001
From: Will Badart <badart_william@bah.com>
Date: Fri, 10 Jul 2020 16:06:59 -0700
Subject: [PATCH 6/6] Add example of pad_sequences usage to split docs

---
 pandas/core/strings.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 3f5f9885d9397..964e81d2fa2ef 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2686,6 +2686,21 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     >>> s.str.split(r"\+|=", expand=True)
          0    1    2
     0    1    1    2
+
+    When using ``expand=True``, if you have already verified that the
+    particular split will result in sequences of uniform length, you may opt
+    out of the (sometimes expensive) length normalization process with
+    ``pad_sequences=False``.
+
+    >>> s = pd.Series(["foo bar", "baz qaz"])
+    >>> s
+    0    foo bar
+    1    baz qaz
+    dtype: object
+    >>> s.str.split(expand=True, pad_sequences=False)
+         0    1
+    0  foo  bar
+    1  baz  qaz
     """
 
     @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})