From effaedee2e8ebc94894de1d574896d4a7f48e676 Mon Sep 17 00:00:00 2001 From: Arjun Date: Sat, 10 Mar 2018 19:06:07 +0530 Subject: [PATCH 1/5] Modified docstring of str.extract --- pandas/core/strings.py | 103 +++++++---------------------------------- 1 file changed, 16 insertions(+), 87 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fac607f4621a8..daae98086dae9 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -633,21 +633,22 @@ def _str_extract_frame(arr, pat, flags=0): def str_extract(arr, pat, flags=0, expand=True): r""" + Return the match object corresponding to regex `pat`. + For each subject string in the Series, extract groups from the - first match of regular expression pat. + first match of regular expression `pat`. Parameters ---------- pat : string - Regular expression pattern with capturing groups + Regular expression pattern with capturing groups. flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE + Re module flags, e.g. re.IGNORECASE. expand : bool, default True - * If True, return DataFrame. - * If False, return Series/Index/DataFrame. - - .. versionadded:: 0.18.0 + If True, return DataFrame, else return Series/Index/DataFrame. + + .. versionadded:: 0.18.0. Returns ------- @@ -668,7 +669,7 @@ def str_extract(arr, pat, flags=0, expand=True): A pattern with two groups will return a DataFrame with two columns. Non-matches will be NaN. - >>> s = Series(['a1', 'b2', 'c3']) + >>> s = pd.Series(['a1', 'b2', 'c3']) >>> s.str.extract(r'([ab])(\d)') 0 1 0 a 1 @@ -707,7 +708,6 @@ def str_extract(arr, pat, flags=0, expand=True): 1 2 2 NaN dtype: object - """ if not isinstance(expand, bool): raise ValueError("expand must be True or False") @@ -898,94 +898,23 @@ def str_join(arr, sep): def str_findall(arr, pat, flags=0): """ - Find all occurrences of pattern or regular expression in the Series/Index. - - Equivalent to applying :func:`re.findall` to all the elements in the - Series/Index. 
+ Find all occurrences of pattern or regular expression in the + Series/Index. Equivalent to :func:`re.findall`. Parameters ---------- pat : string - Pattern or regular expression. - flags : int, default 0 - ``re`` module flags, e.g. `re.IGNORECASE` (default is 0, which means - no flags). + Pattern or regular expression + flags : int, default 0 (no flags) + re module flags, e.g. re.IGNORECASE Returns ------- - Series/Index of lists of strings - All non-overlapping matches of pattern or regular expression in each - string of this Series/Index. + matches : Series/Index of lists See Also -------- - count : Count occurrences of pattern or regular expression in each string - of the Series/Index. - extractall : For each string in the Series, extract groups from all matches - of regular expression and return a DataFrame with one row for each - match and one column for each group. - re.findall : The equivalent ``re`` function to all non-overlapping matches - of pattern or regular expression in string, as a list of strings. - - Examples - -------- - - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - On the other hand, the search for the pattern 'MONKEY' doesn't return any - match: - - >>> s.str.findall('MONKEY') - 0 [] - 1 [] - 2 [] - dtype: object - - Flags can be added to the pattern or regular expression. For instance, - to find the pattern 'MONKEY' ignoring the case: - - >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - When the pattern matches more than one string in the Series, all matches - are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: object - - Regular expressions are supported too. 
For instance, the search for all the - strings ending with the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: object - - If the pattern is found more than once in the same string, then a list of - multiple strings is returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: object - + extractall : returns DataFrame with one column per capture group """ regex = re.compile(pat, flags=flags) return _na_map(regex.findall, arr) From c9f4ac881654f4316a8329e7752b1ac573507a3d Mon Sep 17 00:00:00 2001 From: Arjun Date: Sat, 10 Mar 2018 19:23:09 +0530 Subject: [PATCH 2/5] Updated docstring of str.extract --- pandas/core/strings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index daae98086dae9..e2ac43b0b92e9 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -644,7 +644,6 @@ def str_extract(arr, pat, flags=0, expand=True): Regular expression pattern with capturing groups. flags : int, default 0 (no flags) Re module flags, e.g. re.IGNORECASE. - expand : bool, default True If True, return DataFrame, else return Series/Index/DataFrame. From 3f7aadf2e5866b4e7d7f86caa4f51079fca071d6 Mon Sep 17 00:00:00 2001 From: Arjun Date: Sat, 10 Mar 2018 19:27:39 +0530 Subject: [PATCH 3/5] Updated docstring of str.extract --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index e2ac43b0b92e9..f6637585ce204 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -646,7 +646,7 @@ def str_extract(arr, pat, flags=0, expand=True): Re module flags, e.g. re.IGNORECASE. expand : bool, default True If True, return DataFrame, else return Series/Index/DataFrame. - + .. versionadded:: 0.18.0. 
Returns From 3900c1a2fe2d5f5981cf82b08096563a5ab0f7e5 Mon Sep 17 00:00:00 2001 From: Arjun Date: Thu, 15 Mar 2018 22:14:28 +0530 Subject: [PATCH 4/5] Minor changes to str.extract --- pandas/core/strings.py | 87 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 1c4faf2b8adb7..8bd1c136cf30a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -703,7 +703,7 @@ def str_extract(arr, pat, flags=0, expand=True): pat : string Regular expression pattern with capturing groups. flags : int, default 0 (no flags) - Re module flags, e.g. re.IGNORECASE. + ``re`` module flags, e.g. ``re.IGNORECASE``. expand : bool, default True If True, return DataFrame, else return Series/Index/DataFrame. @@ -957,23 +957,94 @@ def str_join(arr, sep): def str_findall(arr, pat, flags=0): """ - Find all occurrences of pattern or regular expression in the - Series/Index. Equivalent to :func:`re.findall`. + Find all occurrences of pattern or regular expression in the Series/Index. + + Equivalent to applying :func:`re.findall` to all the elements in the + Series/Index. Parameters ---------- pat : string - Pattern or regular expression - flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE + Pattern or regular expression. + flags : int, default 0 + ``re`` module flags, e.g. ``re.IGNORECASE`` (default is 0, which means + no flags). Returns ------- - matches : Series/Index of lists + Series/Index of lists of strings + All non-overlapping matches of pattern or regular expression in each + string of this Series/Index. See Also -------- - extractall : returns DataFrame with one column per capture group + count : Count occurrences of pattern or regular expression in each string + of the Series/Index. 
+ extractall : For each string in the Series, extract groups from all matches + of regular expression and return a DataFrame with one row for each + match and one column for each group. + re.findall : The equivalent ``re`` function to all non-overlapping matches + of pattern or regular expression in string, as a list of strings. + + Examples + -------- + + >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + + The search for the pattern 'Monkey' returns one match: + + >>> s.str.findall('Monkey') + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + On the other hand, the search for the pattern 'MONKEY' doesn't return any + match: + + >>> s.str.findall('MONKEY') + 0 [] + 1 [] + 2 [] + dtype: object + + Flags can be added to the pattern or regular expression. For instance, + to find the pattern 'MONKEY' ignoring the case: + + >>> import re + >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + When the pattern matches more than one string in the Series, all matches + are returned: + + >>> s.str.findall('on') + 0 [on] + 1 [on] + 2 [] + dtype: object + + Regular expressions are supported too. 
For instance, the search for all the + strings ending with the word 'on' is shown next: + + >>> s.str.findall('on$') + 0 [on] + 1 [] + 2 [] + dtype: object + + If the pattern is found more than once in the same string, then a list of + multiple strings is returned: + + >>> s.str.findall('b') + 0 [] + 1 [] + 2 [b, b] + dtype: object + """ regex = re.compile(pat, flags=flags) return _na_map(regex.findall, arr) From ec8bd444625645386a94387fb8dad94b3b1ddf6b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 7 Jul 2018 10:21:42 -0500 Subject: [PATCH 5/5] small cleanup --- pandas/core/strings.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a79edf9a6f1ce..9028ce1a77304 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -840,7 +840,7 @@ def _str_extract_frame(arr, pat, flags=0): def str_extract(arr, pat, flags=0, expand=True): r""" - Return the match object corresponding to regex `pat`. + Extract capture groups in the regex `pat` as columns in a DataFrame. For each subject string in the Series, extract groups from the first match of regular expression `pat`. @@ -851,10 +851,13 @@ def str_extract(arr, pat, flags=0, expand=True): Regular expression pattern with capturing groups. flags : int, default 0 (no flags) ``re`` module flags, e.g. ``re.IGNORECASE``. + See :mod:`re`. expand : bool, default True - If True, return DataFrame, else return Series/Index/DataFrame. + If True, return DataFrame with one column per capture group. + If False, return a Series/Index if there is one capture group + or DataFrame if there are multiple capture groups. - .. versionadded:: 0.18.0. + .. versionadded:: 0.18.0 Returns -------