From c2117cb9249a17712e3ff4377f4494128d343a03 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 18 Mar 2020 08:32:23 -0700 Subject: [PATCH 01/10] Document new functionality --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 11757e1bf14e0..92bb92e412467 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -69,7 +69,7 @@ Other enhancements - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) -- +- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). .. --------------------------------------------------------------------------- From ae0f6a8022423bc76cbb6c45eed1458d18de5a31 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 18 Mar 2020 08:32:59 -0700 Subject: [PATCH 02/10] Add fullmatch matching mode to Series.str --- pandas/core/strings.py | 45 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fbc87b1fdac04..4d93d0118f9f0 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -789,7 +789,7 @@ def rep(x, r): def str_match(arr, pat, case=True, flags=0, na=np.nan): """ - Determine if each string matches a regular expression. + Determine if each string starts with a match of a regular expression. Parameters ---------- @@ -808,6 +808,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): See Also -------- + fullmatch : Stricter matching that requires the entire string to match. 
contains : Analogous, but less strict, relying on re.search instead of re.match. extract : Extract matched groups. @@ -823,6 +824,42 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): return _na_map(f, arr, na, dtype=dtype) +def str_fullmatch(arr, pat, case=True, flags=0, na=np.nan): + """ + Determine if each string entirely matches a regular expression. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + match : Similar, but also returns `True` when only a *prefix* of the string + matches the regular expression. + extract : Extract matched groups. + """ + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + dtype = bool + f = lambda x: regex.fullmatch(x) is not None + + return _na_map(f, arr, na, dtype=dtype) + + def _get_single_group_name(rx): try: return list(rx.groupindex.keys()).pop() @@ -2762,6 +2799,12 @@ def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_fullmatch) + @forbid_nonstring_types(["bytes"]) + def fullmatch(self, pat, case=True, flags=0, na=np.nan): + result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_replace) @forbid_nonstring_types(["bytes"]) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): From 7fc5fd33019cc7fee5b5017131fb1526c30855e2 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 18 Mar 2020 08:33:48 -0700 Subject: [PATCH 03/10] Add tests of Series.str.fullmatch Add tests of Series.str.fullmatch Fix formatting --- 
pandas/tests/test_strings.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 6abf174aa7fd2..57f6be4667c5a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -41,6 +41,7 @@ def assert_series_or_index_equal(left, right): ("join", (",",), {}), ("ljust", (10,), {}), ("match", ("a",), {}), + ("fullmatch", ("a",), {}), ("normalize", ("NFC",), {}), ("pad", (10,), {}), ("partition", (" ",), {"expand": False}), @@ -1176,9 +1177,9 @@ def test_match(self): exp = Series([True, np.nan, False]) tm.assert_series_equal(result, exp) - values = Series(["fooBAD__barBAD", np.nan, "foo"]) + values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) result = values.str.match(".*BAD[_]+.*BAD") - exp = Series([True, np.nan, False]) + exp = Series([True, True, np.nan, False]) tm.assert_series_equal(result, exp) # mixed @@ -1208,6 +1209,27 @@ def test_match(self): exp = Series([True, np.nan, np.nan]) tm.assert_series_equal(exp, res) + def test_fullmatch(self): + values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) + result = values.str.fullmatch(".*BAD[_]+.*BAD") + exp = Series([True, False, np.nan, False]) + tm.assert_series_equal(result, exp) + + # Make sure that flags work + from re import IGNORECASE + + result = values.str.fullmatch(".*Bad[_]+.*bad", flags=IGNORECASE) + tm.assert_series_equal(result, exp) + + # Make sure that the new string arrays work + string_values = Series( + ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype="string" + ) + result = string_values.str.fullmatch(".*BAD[_]+.*BAD") + # Result is nullable boolean with StringDtype + string_exp = Series([True, False, np.nan, False], dtype="boolean") + tm.assert_series_equal(result, string_exp) + def test_extract_expand_None(self): values = Series(["fooBAD__barBAD", np.nan, "foo"]) with pytest.raises(ValueError, match="expand must be True or 
False"): From 19f605d21fab98e828d319b19cba37ce6706211b Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 18 Mar 2020 19:59:05 -0700 Subject: [PATCH 04/10] Clean up test cases per review comments --- pandas/tests/test_strings.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 57f6be4667c5a..a654e7ff64e00 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1215,12 +1215,6 @@ def test_fullmatch(self): exp = Series([True, False, np.nan, False]) tm.assert_series_equal(result, exp) - # Make sure that flags work - from re import IGNORECASE - - result = values.str.fullmatch(".*Bad[_]+.*bad", flags=IGNORECASE) - tm.assert_series_equal(result, exp) - # Make sure that the new string arrays work string_values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype="string" @@ -3406,6 +3400,9 @@ def test_match_findall_flags(self): result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] + result = data.str.fullmatch(pat, flags=re.IGNORECASE) + assert result[0] + result = data.str.findall(pat, flags=re.IGNORECASE) assert result[0][0] == ("dave", "google", "com") From 16bc0118d96fae99a44200a2ae5a899c733abae5 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 18 Mar 2020 20:28:18 -0700 Subject: [PATCH 05/10] Update user guide and add versionadded annotation to API doc --- doc/source/user_guide/text.rst | 25 +++++++++++++++++++------ pandas/core/strings.py | 2 ++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2e4d0fecaf5cf..f67945fc1d52a 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -641,21 +641,34 @@ You can check whether elements contain a pattern: .. 
ipython:: python pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.match(pattern) -The distinction between ``match`` and ``contains`` is strictness: ``match`` -relies on strict ``re.match``, while ``contains`` relies on ``re.search``. +.. ipython:: python + + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], + dtype="string").str.fullmatch(pattern) + +.. versionadded:: 1.1.0 + +The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness: +``fullmatch`` tests whether the entire string matches the regular expression; +``match`` tests whether there is a match of the regular expression that begins +at the first character of the string; and ``contains`` tests whether there is +a match of the regular expression at any position within the string. The +corresponding functions in the ``re`` package for these three match modes are +``re.fullmatch``, ``re.match``, and ``re.search``, respectively. -Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take -an extra ``na`` argument so missing values can be considered True or False: +Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and +``endswith`` take an extra ``na`` argument so missing values can be considered +True or False: .. ipython:: python diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 4d93d0118f9f0..07f4287617a70 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -839,6 +839,8 @@ def str_fullmatch(arr, pat, case=True, flags=0, na=np.nan): na : default NaN Fill value for missing values. + .. 
versionadded:: 1.1.0 + Returns ------- Series/array of boolean values From 3aadc20386709aea91fb714d113069e865ba7585 Mon Sep 17 00:00:00 2001 From: frreiss Date: Thu, 19 Mar 2020 12:38:43 -0700 Subject: [PATCH 06/10] Add type hints to fullmatch and match --- pandas/core/strings.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 07f4287617a70..9853aec07b8b3 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union, Pattern import warnings import numpy as np @@ -10,7 +10,7 @@ import pandas._libs.lib as lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype +from pandas._typing import ArrayLike, Dtype, Scalar from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -787,7 +787,13 @@ def rep(x, r): return result -def str_match(arr, pat, case=True, flags=0, na=np.nan): +def str_match( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): """ Determine if each string starts with a match of a regular expression. @@ -824,7 +830,13 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): return _na_map(f, arr, na, dtype=dtype) -def str_fullmatch(arr, pat, case=True, flags=0, na=np.nan): +def str_fullmatch( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): """ Determine if each string entirely matches a regular expression. 
From 80b9f6388c6cf7b2d668e7dfae1b74401d29133c Mon Sep 17 00:00:00 2001 From: frreiss Date: Thu, 19 Mar 2020 13:02:36 -0700 Subject: [PATCH 07/10] Move versionadded:: annotation before relevant code example --- doc/source/user_guide/text.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index f67945fc1d52a..62fe1cef2afc6 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -651,12 +651,13 @@ Or whether elements match a pattern: pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.match(pattern) +.. versionadded:: 1.1.0 + .. ipython:: python pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.fullmatch(pattern) -.. versionadded:: 1.1.0 The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness: ``fullmatch`` tests whether the entire string matches the regular expression; From 979faee6fdaea9e183d23ba4d624060d1485a1f8 Mon Sep 17 00:00:00 2001 From: frreiss Date: Thu, 19 Mar 2020 13:41:10 -0700 Subject: [PATCH 08/10] Fix import order --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 9853aec07b8b3..dfdeed15440d8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union, Pattern +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union import warnings import numpy as np From 2f94e0c134919bf64634611676a2eda6ee3157c7 Mon Sep 17 00:00:00 2001 From: frreiss Date: Mon, 23 Mar 2020 17:30:57 -0700 Subject: [PATCH 09/10] Address review comments --- doc/source/user_guide/text.rst | 15 ++++++++------- pandas/core/strings.py | 4 ++-- pandas/tests/test_strings.py | 1 + 3 files changed, 11 insertions(+), 9 deletions(-) diff --git 
a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 62fe1cef2afc6..d4e6ae58ddc77 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -658,14 +658,15 @@ Or whether elements match a pattern: pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.fullmatch(pattern) +.. note:: -The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness: -``fullmatch`` tests whether the entire string matches the regular expression; -``match`` tests whether there is a match of the regular expression that begins -at the first character of the string; and ``contains`` tests whether there is -a match of the regular expression at any position within the string. The -corresponding functions in the ``re`` package for these three match modes are -``re.fullmatch``, ``re.match``, and ``re.search``, respectively. + The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness: + ``fullmatch`` tests whether the entire string matches the regular expression; + ``match`` tests whether there is a match of the regular expression that begins + at the first character of the string; and ``contains`` tests whether there is + a match of the regular expression at any position within the string. The + corresponding functions in the ``re`` package for these three match modes are + :ref:`re.fullmatch`, :ref:`re.match`, and :ref:`re.search`, respectively. Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and ``endswith`` take an extra ``na`` argument so missing values can be considered diff --git a/pandas/core/strings.py b/pandas/core/strings.py index dfdeed15440d8..90dd95b98cbe2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -840,6 +840,8 @@ def str_fullmatch( """ Determine if each string entirely matches a regular expression. + .. 
versionadded:: 1.1.0 + Parameters ---------- pat : str @@ -851,8 +853,6 @@ def str_fullmatch( na : default NaN Fill value for missing values. - .. versionadded:: 1.1.0 - Returns ------- Series/array of boolean values diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index a654e7ff64e00..6289c2efea7f1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1210,6 +1210,7 @@ def test_match(self): tm.assert_series_equal(exp, res) def test_fullmatch(self): + # GH 32806 values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) result = values.str.fullmatch(".*BAD[_]+.*BAD") exp = Series([True, False, np.nan, False]) From 5ff52035db7decd0a5df7bb2bc94bd423b061c0d Mon Sep 17 00:00:00 2001 From: frreiss Date: Mon, 23 Mar 2020 22:33:31 -0700 Subject: [PATCH 10/10] Second attempt at adding hyperlinks --- doc/source/user_guide/text.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index d4e6ae58ddc77..234c12ce79822 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -664,9 +664,13 @@ Or whether elements match a pattern: ``fullmatch`` tests whether the entire string matches the regular expression; ``match`` tests whether there is a match of the regular expression that begins at the first character of the string; and ``contains`` tests whether there is - a match of the regular expression at any position within the string. The - corresponding functions in the ``re`` package for these three match modes are - :ref:`re.fullmatch`, :ref:`re.match`, and :ref:`re.search`, respectively. + a match of the regular expression at any position within the string. + + The corresponding functions in the ``re`` package for these three match modes are + `re.fullmatch <https://docs.python.org/3/library/re.html#re.fullmatch>`_, + `re.match <https://docs.python.org/3/library/re.html#re.match>`_, and + `re.search <https://docs.python.org/3/library/re.html#re.search>`_, + respectively. 
Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and ``endswith`` take an extra ``na`` argument so missing values can be considered