From 14d6cb3caae882ac642c2d8d53d9f65d27b75e4e Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Mon, 14 May 2018 00:12:30 +0100 Subject: [PATCH 01/12] Updated docstring for str.rsplit --- pandas/core/strings.py | 49 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5d50c45fe7eca..d567a952e758b 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1481,7 +1481,54 @@ def str_rsplit(arr, pat=None, n=None): Returns ------- - split : Series/Index or DataFrame/MultiIndex of objects + Series/Index or DataFrame/MultiIndex of objects + + See Also + -------- + str.rsplit : Standard library version of this method. + + Examples + -------- + >>> s = pd.Series(["this is good text", "but this is even better"]) + + By default, split will return an object of the same size + having lists containing the split elements + + >>> s.str.rsplit() + 0 [this, is, good, text] + 1 [but, this, is, even, better] + dtype: object + >>> s.str.rsplit("random") + 0 [this is good text] + 1 [but this is even better] + dtype: object + + When using ''expand=True'', the split elements will expand out into + separate columns. + + For Series object, output return type is DataFrame. + + >>> s.str.rsplit(expand=True) + 0 1 2 3 4 + 0 this is good text None + 1 but this is even better + + Parameter 'n' can be used to limit the number of splits in the output. + + >>> s.str.rsplit("is", n=1) + 0 [this , good text] + 1 [but this , even better] + dtype: object + + If NaN is present, it is propagated throughout the columns + during the split. + + >>> s = pd.Series(["this is good text", "but this is even better", np.nan]) + >>> s.str.rsplit(n=3, expand=True) + 0 1 2 3 + 0 this is good text + 1 but this is even better + 2 NaN NaN NaN NaN """ if n is None or n == 0: n = -1 From e79511e6e00d699f5bc9900f1319c0b1aa23d9cb Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Mon, 14 May 2018 01:18:28 +0100 Subject: [PATCH 02/12] Update docstring following validation test --- pandas/core/strings.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index d567a952e758b..08035e4888f8e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1465,19 +1465,21 @@ def str_split(arr, pat=None, n=None): def str_rsplit(arr, pat=None, n=None): """ - Split each string in the Series/Index by the given delimiter - string, starting at the end of the string and working to the front. - Equivalent to :meth:`str.rsplit`. + Split strings around given separator/delimiter. + + Returns a list of the words from each string in + Series/Index, separated by the delimiter string + (starting from the right). Equivalent to :meth:`str.rsplit`. Parameters ---------- pat : string, default None - Separator to split on. If None, splits on whitespace + Separator to split on. If None, splits on whitespace. n : int, default -1 (all) - None, 0 and -1 will be interpreted as return all splits + None, 0 and -1 will be interpreted as return all splits. expand : bool, default False - * If True, return DataFrame/MultiIndex expanding dimensionality. - * If False, return Series/Index. + If True, return DataFrame/MultiIndex expanding dimensionality. + If False, return Series/Index. Returns ------- From 77d8dad3c351af4f5c49c874d3ae8de5a48047ff Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Mon, 14 May 2018 01:30:36 +0100 Subject: [PATCH 03/12] Remove trailing whitespace line 1470 --- pandas/core/strings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 08035e4888f8e..8276a092e2c54 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1467,8 +1467,8 @@ def str_rsplit(arr, pat=None, n=None): """ Split strings around given separator/delimiter. - Returns a list of the words from each string in - Series/Index, separated by the delimiter string + Returns a list of the words from each string in + Series/Index, separated by the delimiter string (starting from the right). Equivalent to :meth:`str.rsplit`. Parameters From 2a1ea740dcf4ff6171c32116ae52737ce3c11eeb Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sun, 3 Jun 2018 15:40:51 +0100 Subject: [PATCH 04/12] DOC: Created reusable docstring for split and rsplit --- pandas/core/strings.py | 272 +++++++++++++++++++---------------------- 1 file changed, 127 insertions(+), 145 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 8276a092e2c54..ffcdfe26f6240 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1346,104 +1346,20 @@ def str_split(arr, pat=None, n=None): """ Split strings around given separator/delimiter. - Split each string in the caller's values by given - pattern, propagating NaN values. Equivalent to :meth:`str.split`. - Parameters ---------- pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. + String or regular expression to split on; If not specified, + split on whitespace. n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. + Limit number of splits in output; ``None``, 0 and -1 will + be interpreted as return all splits. expand : bool, default False Expand the splitted strings into separate columns. - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - Returns ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - See Also - -------- - str.split : Standard library version of this method. - Series.str.get_dummies : Split each string into dummy variables. - Series.str.partition : Split string on a separator, returning - the before, separator, and after components. - - Examples - -------- - >>> s = pd.Series(["this is good text", "but this is even better"]) - - By default, split will return an object of the same size - having lists containing the split elements - - >>> s.str.split() - 0 [this, is, good, text] - 1 [but, this, is, even, better] - dtype: object - >>> s.str.split("random") - 0 [this is good text] - 1 [but this is even better] - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. - - For Series object, output return type is DataFrame. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is good text None - 1 but this is even better - >>> s.str.split(" is ", expand=True) - 0 1 - 0 this good text - 1 but this even better - - For Index object, output return type is MultiIndex. - - >>> i = pd.Index(["ba 100 001", "ba 101 002", "ba 102 003"]) - >>> i.str.split(expand=True) - MultiIndex(levels=[['ba'], ['100', '101', '102'], ['001', '002', '003']], - labels=[[0, 0, 0], [0, 1, 2], [0, 1, 2]]) - - Parameter `n` can be used to limit the number of splits in the output. - - >>> s.str.split("is", n=1) - 0 [th, is good text] - 1 [but th, is even better] - dtype: object - >>> s.str.split("is", n=1, expand=True) - 0 1 - 0 th is good text - 1 but th is even better - - If NaN is present, it is propagated throughout the columns - during the split. - - >>> s = pd.Series(["this is good text", "but this is even better", np.nan]) - >>> s.str.split(n=3, expand=True) - 0 1 2 3 - 0 this is good text - 1 but this is even better - 2 NaN NaN NaN NaN + Series, Index, DataFrame or MultiIndex """ if pat is None: if n is None or n == 0: @@ -1465,16 +1381,13 @@ def str_split(arr, pat=None, n=None): def str_rsplit(arr, pat=None, n=None): """ - Split strings around given separator/delimiter. - - Returns a list of the words from each string in - Series/Index, separated by the delimiter string - (starting from the right). Equivalent to :meth:`str.rsplit`. + Split strings around given separator/delimiter (starting from + the right). Parameters ---------- pat : string, default None - Separator to split on. If None, splits on whitespace. + Separator to split on; If None, splits on whitespace. n : int, default -1 (all) None, 0 and -1 will be interpreted as return all splits. expand : bool, default False @@ -1483,54 +1396,7 @@ def str_rsplit(arr, pat=None, n=None): Returns ------- - Series/Index or DataFrame/MultiIndex of objects - - See Also - -------- - str.rsplit : Standard library version of this method. - - Examples - -------- - >>> s = pd.Series(["this is good text", "but this is even better"]) - - By default, split will return an object of the same size - having lists containing the split elements - - >>> s.str.rsplit() - 0 [this, is, good, text] - 1 [but, this, is, even, better] - dtype: object - >>> s.str.rsplit("random") - 0 [this is good text] - 1 [but this is even better] - dtype: object - - When using ''expand=True'', the split elements will expand out into - separate columns. - - For Series object, output return type is DataFrame. - - >>> s.str.rsplit(expand=True) - 0 1 2 3 4 - 0 this is good text None - 1 but this is even better - - Parameter 'n' can be used to limit the number of splits in the output. - - >>> s.str.rsplit("is", n=1) - 0 [this , good text] - 1 [but this , even better] - dtype: object - - If NaN is present, it is propagated throughout the columns - during the split. - - >>> s = pd.Series(["this is good text", "but this is even better", np.nan]) - >>> s.str.rsplit(n=3, expand=True) - 0 1 2 3 - 0 this is good text - 1 but this is even better - 2 NaN NaN NaN NaN + Series/Index or DataFrame/MultiIndex of objects """ if n is None or n == 0: n = -1 @@ -2374,12 +2240,128 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): res = Series(res, index=data.index, name=self._orig.name) return res - @copy(str_split) + _shared_docs['str_split'] = (""" + Split strings around given separator/delimiter. + + Returns a list of the words from each string in Series/Index, + split by the given delimiter string, starting at the %(side)s of the + string. Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + pat : str, optional + String or regular expression to split on. + If not specified, split on whitespace. + n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the splitted strings into separate columns. + + * If ``True``, return DataFrame/MultiIndex expanding dimensionality. + * If ``False``, return Series/Index, containing lists of strings. + + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + + See Also + -------- + %(also)s + + Examples + -------- + >>> s = pd.Series(["this is good text", "but this is even better"]) + + By default, split and rsplit will return an object of the same size + having lists containing the split elements + + >>> s.str.split() + 0 [this, is, good, text] + 1 [but, this, is, even, better] + dtype: object + + >>> s.str.rsplit() + 0 [this, is, good, text] + 1 [but, this, is, even, better] + dtype: object + + >>> s.str.split("random") + 0 [this is good text] + 1 [but this is even better] + dtype: object + + >>> s.str.rsplit("random") + 0 [this is good text] + 1 [but this is even better] + dtype: object + + When using ``expand=True``, the split and rsplit elements will expand out into + separate columns. + + For Series object, output return type is DataFrame. + + >>> s.str.split(expand=True) + 0 1 2 3 4 + 0 this is good text None + 1 but this is even better + + >>> s.str.split(" is ", expand=True) + 0 1 + 0 this good text + 1 but this even better + + Parameter `n` can be used to limit the number of splits in the output. + + >>> s.str.split("is", n=1) + 0 [th, is good text] + 1 [but th, is even better] + dtype: object + + >>> s.str.rsplit("is", n=1) + 0 [this , good text] + 1 [but this , even better] + dtype: object + + If NaN is present, it is propagated throughout the columns + during the split. + + >>> s = pd.Series(["this is good text", "but this is even better", np.nan]) + + >>> s.str.split(n=3, expand=True) + 0 1 2 3 + 0 this is good text + 1 but this is even better + 2 NaN NaN NaN NaN + + >>> s.str.rsplit(n=3, expand=True) + 0 1 2 3 + 0 this is good text + 1 but this is even better + 2 NaN NaN NaN NaN + """) + + @Appender(_shared_docs['str_split'] % dict(side='start', + method='split')) def split(self, pat=None, n=-1, expand=False): result = str_split(self._data, pat, n=n) return self._wrap_result(result, expand=expand) - @copy(str_rsplit) + @Appender(_shared_docs['str_split'] % dict(side='end', + method='rsplit')) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._data, pat, n=n) return self._wrap_result(result, expand=expand) From 522ca66dbcae862dd457e0a2e730a644dc3b3ec4 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sun, 3 Jun 2018 16:37:08 +0100 Subject: [PATCH 05/12] DOC: Created reusable docstring for split and rsplit --- pandas/core/strings.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ffcdfe26f6240..fb08748d1436c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1349,17 +1349,17 @@ def str_split(arr, pat=None, n=None): Parameters ---------- pat : str, optional - String or regular expression to split on; If not specified, + String or regular expression to split on; If not specified, split on whitespace. n : int, default -1 (all) - Limit number of splits in output; ``None``, 0 and -1 will + Limit number of splits in output; ``None``, 0 and -1 will be interpreted as return all splits. expand : bool, default False Expand the splitted strings into separate columns. Returns ------- - Series, Index, DataFrame or MultiIndex + Series, Index, DataFrame or MultiIndex """ if pat is None: if n is None or n == 0: @@ -1381,7 +1381,7 @@ def str_split(arr, pat=None, n=None): def str_rsplit(arr, pat=None, n=None): """ - Split strings around given separator/delimiter (starting from + Split strings around given separator/delimiter (starting from the right). Parameters @@ -2243,8 +2243,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): _shared_docs['str_split'] = (""" Split strings around given separator/delimiter. - Returns a list of the words from each string in Series/Index, - split by the given delimiter string, starting at the %(side)s of the + Returns a list of the words from each string in Series/Index, + split by the given delimiter string, starting at the %(side)s of the string. Equivalent to :meth:`str.%(method)s`. Parameters @@ -2309,8 +2309,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 1 [but this is even better] dtype: object - When using ``expand=True``, the split and rsplit elements will expand out into - separate columns. + When using ``expand=True``, the split and rsplit elements will + expand out into separate columns. For Series object, output return type is DataFrame. From 25bf40be2c95a95aa93e89dd8856ac0135d306f0 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Tue, 5 Jun 2018 01:07:00 +0100 Subject: [PATCH 06/12] Remove trailing whitespace --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fb08748d1436c..24467a4eed3bd 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1359,7 +1359,7 @@ def str_split(arr, pat=None, n=None): Returns ------- - Series, Index, DataFrame or MultiIndex + Series, Index, DataFrame or MultiIndex """ if pat is None: if n is None or n == 0: From 4d7d71b7c342611efeed54210549270fc8b88f37 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Sun, 10 Jun 2018 00:00:36 +0100 Subject: [PATCH 07/12] Modified see also --- pandas/core/strings.py | 61 +++++++++++------------------------------- 1 file changed, 16 insertions(+), 45 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 24467a4eed3bd..c06f25f6e568c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1343,24 +1343,7 @@ def str_pad(arr, width, side='left', fillchar=' '): def str_split(arr, pat=None, n=None): - """ - Split strings around given separator/delimiter. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on; If not specified, - split on whitespace. - n : int, default -1 (all) - Limit number of splits in output; ``None``, 0 and -1 will - be interpreted as return all splits. - expand : bool, default False - Expand the splitted strings into separate columns. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - """ + if pat is None: if n is None or n == 0: n = -1 @@ -1380,24 +1363,7 @@ def str_split(arr, pat=None, n=None): def str_rsplit(arr, pat=None, n=None): - """ - Split strings around given separator/delimiter (starting from - the right). - - Parameters - ---------- - pat : string, default None - Separator to split on; If None, splits on whitespace. - n : int, default -1 (all) - None, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - If True, return DataFrame/MultiIndex expanding dimensionality. - If False, return Series/Index. - - Returns - ------- - Series/Index or DataFrame/MultiIndex of objects - """ + if n is None or n == 0: n = -1 f = lambda x: x.rsplit(pat, n) @@ -2243,9 +2209,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): _shared_docs['str_split'] = (""" Split strings around given separator/delimiter. - Returns a list of the words from each string in Series/Index, - split by the given delimiter string, starting at the %(side)s of the - string. Equivalent to :meth:`str.%(method)s`. + Splits the string in the Series/Index from the %(side)s, + at the specified delimiter string.Equivalent to :meth:`str.%(method)s`. Parameters ---------- @@ -2294,7 +2259,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 1 [but, this, is, even, better] dtype: object - >>> s.str.rsplit() + >>> s.str.rsplit() 0 [this, is, good, text] 1 [but, this, is, even, better] dtype: object @@ -2345,7 +2310,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 0 1 2 3 0 this is good text 1 but this is even better - 2 NaN NaN NaN NaN + 2 NaN NaN NaN NaN >>> s.str.rsplit(n=3, expand=True) 0 1 2 3 @@ -2354,14 +2319,20 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 2 NaN NaN NaN NaN """) - @Appender(_shared_docs['str_split'] % dict(side='start', - method='split')) + @Appender(_shared_docs['str_split'] % { + 'side': 'beginning', + 'method': 'split', + 'also': 'rsplit : Splits string at the last occurrence of delimiter' + }) def split(self, pat=None, n=-1, expand=False): result = str_split(self._data, pat, n=n) return self._wrap_result(result, expand=expand) - @Appender(_shared_docs['str_split'] % dict(side='end', - method='rsplit')) + @Appender(_shared_docs['str_split'] % { + 'side': 'end', + 'method': 'rsplit', + 'also': 'split : Splits string at the first occurrence of delimiter' + }) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._data, pat, n=n) return self._wrap_result(result, expand=expand) From 773cfbf37cded11585f2e9b8fdeac44b3d97d547 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Tue, 19 Jun 2018 02:29:34 +0100 Subject: [PATCH 08/12] Modified examples and See Also --- pandas/core/strings.py | 114 +++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 60 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index c06f25f6e568c..69cce07662d54 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2210,7 +2210,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Split strings around given separator/delimiter. Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string.Equivalent to :meth:`str.%(method)s`. + at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. Parameters ---------- @@ -2245,84 +2245,79 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): See Also -------- - %(also)s + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, starting from + the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. Examples -------- - >>> s = pd.Series(["this is good text", "but this is even better"]) + >>> s = pd.Series(["this is a regular sentence", "this,is,comma,separated,text", np.nan]) By default, split and rsplit will return an object of the same size - having lists containing the split elements + having lists containing the split elements. + + Parameter `n` can be used to limit the number of splits on the delimiter. If delimiter is + not specified, string is split on whitespace. - >>> s.str.split() - 0 [this, is, good, text] - 1 [but, this, is, even, better] + >>> s.str.split(n=2) + 0 [this, is, a regular sentence] + 1 [this,is,comma,separated,text] + 2 NaN dtype: object - >>> s.str.rsplit() - 0 [this, is, good, text] - 1 [but, this, is, even, better] + >>> s.str.rsplit(n=2) + 0 [this is a, regular, sentence] + 1 [this,is,comma,separated,text] + 2 NaN dtype: object - >>> s.str.split("random") - 0 [this is good text] - 1 [but this is even better] + >>> s.str.split(",", n=2) + 0 [this is a regular sentence] + 1 [this, is, comma,separated,text] + 2 NaN dtype: object - >>> s.str.rsplit("random") - 0 [this is good text] - 1 [but this is even better] + >>> s.str.rsplit(",", n=2) + 0 [this is a regular sentence] + 1 [this,is,comma, separated, text] + 2 NaN dtype: object When using ``expand=True``, the split and rsplit elements will expand out into separate columns. - For Series object, output return type is DataFrame. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is good text None - 1 but this is even better - - >>> s.str.split(" is ", expand=True) - 0 1 - 0 this good text - 1 but this even better - - Parameter `n` can be used to limit the number of splits in the output. - - >>> s.str.split("is", n=1) - 0 [th, is good text] - 1 [but th, is even better] - dtype: object - - >>> s.str.rsplit("is", n=1) - 0 [this , good text] - 1 [but this , even better] - dtype: object - - If NaN is present, it is propagated throughout the columns - during the split. - - >>> s = pd.Series(["this is good text", "but this is even better", np.nan]) - - >>> s.str.split(n=3, expand=True) - 0 1 2 3 - 0 this is good text - 1 but this is even better - 2 NaN NaN NaN NaN - - >>> s.str.rsplit(n=3, expand=True) - 0 1 2 3 - 0 this is good text - 1 but this is even better - 2 NaN NaN NaN NaN + >>> s.str.split(n=2, expand=True) + 0 1 2 + 0 this is a regular sentence + 1 this,is,comma,separated,text None None + 2 NaN NaN NaN + + >>> s.str.rsplit(n=2, expand=True) + 0 1 2 + 0 this is a regular sentence + 1 this,is,comma,separated,text None None + 2 NaN NaN NaN + + >>> s.str.split(",", n=2, expand=True) + 0 1 2 + 0 this is a regular sentence None None + 1 this is comma,separated,text + 2 NaN NaN NaN + + >>> s.str.rsplit(",", n=2, expand=True) + 0 1 2 + 0 this is a regular sentence None None + 1 this,is,comma separated text + 2 NaN NaN NaN """) @Appender(_shared_docs['str_split'] % { 'side': 'beginning', - 'method': 'split', - 'also': 'rsplit : Splits string at the last occurrence of delimiter' + 'method': 'split' }) def split(self, pat=None, n=-1, expand=False): result = str_split(self._data, pat, n=n) @@ -2330,8 +2325,7 @@ def split(self, pat=None, n=-1, expand=False): @Appender(_shared_docs['str_split'] % { 'side': 'end', - 'method': 'rsplit', - 'also': 'split : Splits string at the first occurrence of delimiter' + 'method': 'rsplit' }) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._data, pat, n=n) From e9857e1ec0824c9f389785921b64451d83e2a1d5 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Thu, 21 Jun 2018 02:52:37 +0100 Subject: [PATCH 09/12] Update Examples 21 June 2018 --- pandas/core/strings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b1a2a60b73ace..920519b8ae9d3 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2255,8 +2255,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Examples -------- - >>> s = pd.Series(["this is a regular sentence", - "path/to/python/file", np.nan]) + >>> s = pd.Series(["this is a regular sentence", "path/to/python/file", np.nan]) By default, split and rsplit will return an object of the same size having lists containing the split elements. From 190f0cb583e4e10b457766843330af89722c1bbb Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Fri, 22 Jun 2018 01:10:57 +0100 Subject: [PATCH 10/12] Examples - final version --- pandas/core/strings.py | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 920519b8ae9d3..95d693b138402 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2255,30 +2255,45 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Examples -------- - >>> s = pd.Series(["this is a regular sentence", "path/to/python/file", np.nan]) + >>> s = pd.Series(["this is a regular sentence", "https://docs.python.org/3/tutorial/index.html", np.nan]) - By default, split and rsplit will return an object of the same size - having lists containing the split elements. + In the default setting, the string is split by whitespace. >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [path/to/python/file] - 2 NaN + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN dtype: object - Parameter `n` can be used to limit the number of splits on the - delimiter. + Without the `n` parameter, the outputs of `rsplit` and `split` are identical. - >>> s.str.split("/", n=2) - 0 [this is a regular sentence] - 1 [path, to, python/file] - 2 NaN - dtype: object + >>> s.str.rsplit() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2) + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object - >>> s.str.rsplit("/", n=2) - 0 [this is a regular sentence] - 1 [path/to, python, file] - 2 NaN + >>> s.str.rsplit(n=2) + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat = "/") + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN dtype: object When using ``expand=True``, the split elements will expand out into @@ -2286,16 +2301,19 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): the columns during the split. >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 path/to/python/file None None None None - 2 NaN NaN NaN NaN NaN - - >>> s.str.split("/", n=2, expand=True) - 0 1 2 - 0 this is a regular sentence None None - 1 path to python/file - 2 NaN NaN NaN + 0 1 2 3 4 + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html None None None None + 2 NaN NaN NaN NaN NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. + + >>> s.str.rsplit("/", n=1, expand=True) + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN """) @Appender(_shared_docs['str_split'] % { From 390d351a4ab502d75dfced6ddcfb8176fe0052da Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 22 Jun 2018 06:20:01 -0400 Subject: [PATCH 11/12] some lint --- pandas/core/strings.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index c2ff9bd1d666d..7c1fef035cbf7 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1363,7 +1363,7 @@ def str_split(arr, pat=None, n=None): def str_rsplit(arr, pat=None, n=None): - + if n is None or n == 0: n = -1 f = lambda x: x.rsplit(pat, n) @@ -2234,10 +2234,10 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): See Also -------- Series.str.split : Split strings around given separator/delimiter. - Series.str.rsplit : Splits string around given separator/delimiter, + Series.str.rsplit : Splits string around given separator/delimiter, starting from the right. - Series.str.join : Join lists contained as elements in the Series/Index - with passed delimiter. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. str.split : Standard library version for split. str.rsplit : Standard library version for rsplit. @@ -2255,7 +2255,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Examples -------- - >>> s = pd.Series(["this is a regular sentence", "https://docs.python.org/3/tutorial/index.html", np.nan]) + >>> s = pd.Series(["this is a regular sentence", "https://docs.python.org/3/tutorial/index.html", np.nan]) In the default setting, the string is split by whitespace. @@ -2276,7 +2276,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): The `n` parameter can be used to limit the number of splits on the delimiter. The outputs of `split` and `rsplit` are different. - >>> s.str.split(n=2) + >>> s.str.split(n=2) 0 [this, is, a regular sentence] 1 [https://docs.python.org/3/tutorial/index.html] 2 NaN @@ -2288,7 +2288,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 2 NaN dtype: object - The `pat` parameter can be used to split by other characters. + The `pat` parameter can be used to split by other characters. >>> s.str.split(pat = "/") 0 [this is a regular sentence] @@ -2297,12 +2297,12 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): dtype: object When using ``expand=True``, the split elements will expand out into - separate columns. If NaN is present, it is propagated throughout + separate columns. If NaN is present, it is propagated throughout the columns during the split. >>> s.str.split(expand=True) 0 1 2 3 4 - 0 this is a regular sentence + 0 this is a regular sentence 1 https://docs.python.org/3/tutorial/index.html None None None None 2 NaN NaN NaN NaN NaN @@ -2318,16 +2318,14 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): @Appender(_shared_docs['str_split'] % { 'side': 'beginning', - 'method': 'split' - }) + 'method': 'split'}) def split(self, pat=None, n=-1, expand=False): result = str_split(self._data, pat, n=n) return self._wrap_result(result, expand=expand) @Appender(_shared_docs['str_split'] % { 'side': 'end', - 'method': 'rsplit' - }) + 'method': 'rsplit'}) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._data, pat, n=n) return self._wrap_result(result, expand=expand) From b11873f1345aabded09304024b1ae5d62920b2f8 Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Fri, 22 Jun 2018 16:55:43 +0100 Subject: [PATCH 12/12] Fixed lint issues --- pandas/core/strings.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7c1fef035cbf7..b27cfdfe3f1bd 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2255,7 +2255,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Examples -------- - >>> s = pd.Series(["this is a regular sentence", "https://docs.python.org/3/tutorial/index.html", np.nan]) + >>> s = pd.Series(["this is a regular sentence", + "https://docs.python.org/3/tutorial/index.html", np.nan]) In the default setting, the string is split by whitespace. @@ -2265,7 +2266,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): 2 NaN dtype: object - Without the `n` parameter, the outputs of `rsplit` and `split` are identical. + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. >>> s.str.rsplit() 0 [this, is, a, regular, sentence] @@ -2301,10 +2303,15 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): the columns during the split. >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html None None None None - 2 NaN NaN NaN NaN NaN + 0 1 2 3 + 0 this is a regular + 1 https://docs.python.org/3/tutorial/index.html None None None + 2 NaN NaN NaN NaN \ + + 4 + 0 sentence + 1 None + 2 NaN For slightly more complex use cases like splitting the html document name from a url, a combination of parameter settings can be used.