diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2e4d0fecaf5cf..234c12ce79822 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -641,21 +641,40 @@ You can check whether elements contain a pattern: .. ipython:: python pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.match(pattern) -The distinction between ``match`` and ``contains`` is strictness: ``match`` -relies on strict ``re.match``, while ``contains`` relies on ``re.search``. +.. versionadded:: 1.1.0 -Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take -an extra ``na`` argument so missing values can be considered True or False: +.. ipython:: python + + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], + dtype="string").str.fullmatch(pattern) + +.. note:: + + The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness: + ``fullmatch`` tests whether the entire string matches the regular expression; + ``match`` tests whether there is a match of the regular expression that begins + at the first character of the string; and ``contains`` tests whether there is + a match of the regular expression at any position within the string. + + The corresponding functions in the ``re`` package for these three match modes are + `re.fullmatch <https://docs.python.org/3/library/re.html#re.fullmatch>`_, + `re.match <https://docs.python.org/3/library/re.html#re.match>`_, and + `re.search <https://docs.python.org/3/library/re.html#re.search>`_, + respectively. + +Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and +``endswith`` take an extra ``na`` argument so missing values can be considered +True or False: .. 
ipython:: python diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 692df075f25cb..c50908619a340 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -69,6 +69,7 @@ Other enhancements - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) +- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7f26c7a26d4d8..8ed4fd1b8e340 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union import warnings import numpy as np @@ -10,7 +10,7 @@ import pandas._libs.lib as lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype +from pandas._typing import ArrayLike, Dtype, Scalar from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -787,9 +787,15 @@ def rep(x, r): return result -def str_match(arr, pat, case=True, flags=0, na=np.nan): +def str_match( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): """ - Determine if each string matches a regular expression. + Determine if each string starts with a match of a regular expression. 
Parameters ---------- @@ -808,6 +814,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): See Also -------- + fullmatch : Stricter matching that requires the entire string to match. contains : Analogous, but less strict, relying on re.search instead of re.match. extract : Extract matched groups. @@ -823,6 +830,50 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): return _na_map(f, arr, na, dtype=dtype) +def str_fullmatch( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): + """ + Determine if each string entirely matches a regular expression. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + match : Similar, but also returns `True` when only a *prefix* of the string + matches the regular expression. + extract : Extract matched groups. 
+ """ + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + dtype = bool + f = lambda x: regex.fullmatch(x) is not None + + return _na_map(f, arr, na, dtype=dtype) + + def _get_single_group_name(rx): try: return list(rx.groupindex.keys()).pop() @@ -2762,6 +2813,12 @@ def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_fullmatch) + @forbid_nonstring_types(["bytes"]) + def fullmatch(self, pat, case=True, flags=0, na=np.nan): + result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_replace) @forbid_nonstring_types(["bytes"]) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 6abf174aa7fd2..6289c2efea7f1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -41,6 +41,7 @@ def assert_series_or_index_equal(left, right): ("join", (",",), {}), ("ljust", (10,), {}), ("match", ("a",), {}), + ("fullmatch", ("a",), {}), ("normalize", ("NFC",), {}), ("pad", (10,), {}), ("partition", (" ",), {"expand": False}), @@ -1176,9 +1177,9 @@ def test_match(self): exp = Series([True, np.nan, False]) tm.assert_series_equal(result, exp) - values = Series(["fooBAD__barBAD", np.nan, "foo"]) + values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) result = values.str.match(".*BAD[_]+.*BAD") - exp = Series([True, np.nan, False]) + exp = Series([True, True, np.nan, False]) tm.assert_series_equal(result, exp) # mixed @@ -1208,6 +1209,22 @@ def test_match(self): exp = Series([True, np.nan, np.nan]) tm.assert_series_equal(exp, res) + def test_fullmatch(self): + # GH 32806 + values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) + result = 
values.str.fullmatch(".*BAD[_]+.*BAD") + exp = Series([True, False, np.nan, False]) + tm.assert_series_equal(result, exp) + + # Make sure that the new string arrays work + string_values = Series( + ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype="string" + ) + result = string_values.str.fullmatch(".*BAD[_]+.*BAD") + # Result is nullable boolean with StringDtype + string_exp = Series([True, False, np.nan, False], dtype="boolean") + tm.assert_series_equal(result, string_exp) + def test_extract_expand_None(self): values = Series(["fooBAD__barBAD", np.nan, "foo"]) with pytest.raises(ValueError, match="expand must be True or False"): @@ -3384,6 +3401,9 @@ def test_match_findall_flags(self): result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] + result = data.str.fullmatch(pat, flags=re.IGNORECASE) + assert result[0] + result = data.str.findall(pat, flags=re.IGNORECASE) assert result[0][0] == ("dave", "google", "com")