From 60cc73575ab590f0f28c575e6e7e1fa0db776633 Mon Sep 17 00:00:00 2001 From: Will Badart Date: Fri, 10 Jul 2020 12:03:39 -0700 Subject: [PATCH 1/6] Inline cons_row The abstraction isn't used anywhere else, so let's see if eliminating the function call helps performance at all. --- pandas/core/strings.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a1db7742916de..631373268afa8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2220,13 +2220,7 @@ def _wrap_result( # required when expand=True is explicitly specified # not needed when inferred - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] + result = [x if is_list_like(x) else [x] for x in result] if result: # propagate nan values to match longest sequence (GH 18450) max_len = max(len(x) for x in result) From 7b1caaaadcad307764494358c1345bdcb176b892 Mon Sep 17 00:00:00 2001 From: Will Badart Date: Fri, 10 Jul 2020 12:27:50 -0700 Subject: [PATCH 2/6] Allow split caller to skip expand's sequence padding This can be useful when the user has already ensured that the split will result in a clean rectangle, such as a complete list of well-formed IPv4 addresses being split on ".". It allows them to skip three Python iteration passes over the resulting data. 
--- pandas/core/strings.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 631373268afa8..206999fd3548c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2184,6 +2184,7 @@ def _wrap_result( expand=None, fill_value=np.nan, returns_string=True, + pad_sequences=True, ): from pandas import Index, Series, MultiIndex @@ -2216,7 +2217,7 @@ def _wrap_result( # infer from ndim if expand is not specified expand = result.ndim != 1 - elif expand is True and not isinstance(self._orig, ABCIndexClass): + elif expand is True and not isinstance(self._orig, ABCIndexClass) and pad_sequences: # required when expand=True is explicitly specified # not needed when inferred @@ -2679,15 +2680,15 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): + def split(self, pat=None, n=-1, expand=False, pad_sequences=True): result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): + def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True): result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences) _shared_docs[ "str_partition" @@ -2706,6 +2707,13 @@ def rsplit(self, pat=None, n=-1, expand=False): expand : bool, default True If True, return DataFrame/MultiIndex expanding dimensionality. If False, return Series/Index. 
+ pad_sequences : bool, default True + If True and expand is True, pad the resulting sequences with nan to + match the length of the longest sequence (so each row has the same + number of columns). + If False, the result of each split must result in sequences of equal + length. + Has no effect when expand is False. Returns ------- From 06505c0464a2f2a0fa12a79a44d4648f7ebf7c01 Mon Sep 17 00:00:00 2001 From: Will Badart Date: Fri, 10 Jul 2020 12:38:44 -0700 Subject: [PATCH 3/6] Apply black formatting --- pandas/core/strings.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 206999fd3548c..cfccd849b0346 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2217,7 +2217,11 @@ def _wrap_result( # infer from ndim if expand is not specified expand = result.ndim != 1 - elif expand is True and not isinstance(self._orig, ABCIndexClass) and pad_sequences: + elif ( + expand is True + and not isinstance(self._orig, ABCIndexClass) + and pad_sequences + ): # required when expand=True is explicitly specified # not needed when inferred @@ -2682,13 +2686,17 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False, pad_sequences=True): result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences) + return self._wrap_result( + result, expand=expand, returns_string=expand, pad_sequences=pad_sequences + ) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True): result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand, pad_sequences=pad_sequences) + return self._wrap_result( + result, expand=expand, returns_string=expand, 
pad_sequences=pad_sequences + ) _shared_docs[ "str_partition" From ae81973de279fb9c1fade03176b6752ad7579263 Mon Sep 17 00:00:00 2001 From: Will Badart Date: Fri, 10 Jul 2020 12:54:57 -0700 Subject: [PATCH 4/6] Corrected placement of pad_sequences documentation --- pandas/core/strings.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index cfccd849b0346..b04e3b26500a7 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2568,6 +2568,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): * If ``True``, return DataFrame/MultiIndex expanding dimensionality. * If ``False``, return Series/Index, containing lists of strings. + pad_sequences : bool, default True + When expand is ``True``, pad the ending of resulting sequences with + ``nan`` to the length of the longest sequence, ensuring each row in the + resulting DataFrame has the same number of columns. + If ``False``, you must be sure pre-hoc that the split strings will have + uniform lengths. + Has no effect when expand is ``False``. Returns ------- @@ -2715,13 +2722,6 @@ def rsplit(self, pat=None, n=-1, expand=False, pad_sequences=True): expand : bool, default True If True, return DataFrame/MultiIndex expanding dimensionality. If False, return Series/Index. - pad_sequences : bool, default True - If True and expand is True, pad the resulting sequences with nan to - match the length of the longest sequence (so each row has the same - number of columns). - If False, the result of each split must result in sequences of equal - length. - Has no effect when expand is False. 
Returns ------- From cd233bc2d05103606c0720880177cf1dd3837446 Mon Sep 17 00:00:00 2001 From: Will Badart Date: Fri, 10 Jul 2020 16:04:24 -0700 Subject: [PATCH 5/6] Fix pad_sequences logic We can't skip the whole block because, when it starts, result is a numpy array of lists, but it must still be converted to a list of lists for expand to function as expected. --- pandas/core/strings.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b04e3b26500a7..3f5f9885d9397 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2217,21 +2217,20 @@ def _wrap_result( # infer from ndim if expand is not specified expand = result.ndim != 1 - elif ( - expand is True - and not isinstance(self._orig, ABCIndexClass) - and pad_sequences - ): - # required when expand=True is explicitly specified - # not needed when inferred - - result = [x if is_list_like(x) else [x] for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] + elif expand is True and not isinstance(self._orig, ABCIndexClass): + if pad_sequences: + # required when expand=True is explicitly specified + # not needed when inferred + result = [x if is_list_like(x) else [x] for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x + for x in result + ] + else: + result = result.tolist() if not isinstance(expand, bool): raise ValueError("expand must be True or False") From e242615a2927f2335e3a115ab5e9be1082a9cc7b Mon Sep 17 00:00:00 2001 From: Will Badart Date: Fri, 10 Jul 2020 16:06:59 -0700 Subject: [PATCH 6/6] Add example of pad_sequences usage to split docs --- pandas/core/strings.py | 15 +++++++++++++++ 1 file 
changed, 15 insertions(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3f5f9885d9397..964e81d2fa2ef 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2686,6 +2686,21 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): >>> s.str.split(r"\+|=", expand=True) 0 1 2 0 1 1 2 + + When using ``expand=True``, if you have already verified that the + particular split will result in sequences of uniform length, you may opt + out of the (sometimes expensive) length normalization process with + ``pad_sequences=False``. + + >>> s = pd.Series(["foo bar", "baz qaz"]) + >>> s + 0 foo bar + 1 baz qaz + dtype: object + >>> s.str.split(expand=True, pad_sequences=False) + 0 1 + 0 foo bar + 1 baz qaz """ @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})