Skip to content

Commit bed9103

Browse files
authored
[ENH] Add "fullmatch" matching mode to Series.str [#32806] (#32807)
1 parent 08fce67 commit bed9103

File tree

4 files changed

+109
-12
lines changed

4 files changed

+109
-12
lines changed

doc/source/user_guide/text.rst

+25-6
Original file line numberDiff line numberDiff line change
@@ -641,21 +641,40 @@ You can check whether elements contain a pattern:
641641
.. ipython:: python
642642
643643
pattern = r'[0-9][a-z]'
644-
pd.Series(['1', '2', '3a', '3b', '03c'],
644+
pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
645645
dtype="string").str.contains(pattern)
646646
647647
Or whether elements match a pattern:
648648

649649
.. ipython:: python
650650
651-
pd.Series(['1', '2', '3a', '3b', '03c'],
651+
pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
652652
dtype="string").str.match(pattern)
653653
654-
The distinction between ``match`` and ``contains`` is strictness: ``match``
655-
relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
654+
Or whether elements entirely match a pattern:

.. versionadded:: 1.1.0
656655

657-
Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
658-
an extra ``na`` argument so missing values can be considered True or False:
656+
.. ipython:: python
657+
658+
pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
659+
dtype="string").str.fullmatch(pattern)
660+
661+
.. note::
662+
663+
The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness:
664+
``fullmatch`` tests whether the entire string matches the regular expression;
665+
``match`` tests whether there is a match of the regular expression that begins
666+
at the first character of the string; and ``contains`` tests whether there is
667+
a match of the regular expression at any position within the string.
668+
669+
The corresponding functions in the ``re`` module for these three match modes are
670+
`re.fullmatch <https://docs.python.org/3/library/re.html#re.fullmatch>`_,
671+
`re.match <https://docs.python.org/3/library/re.html#re.match>`_, and
672+
`re.search <https://docs.python.org/3/library/re.html#re.search>`_,
673+
respectively.
674+
675+
Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and
676+
``endswith`` take an extra ``na`` argument so missing values can be considered
677+
True or False:
659678

660679
.. ipython:: python
661680

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ Other enhancements
6969
- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
7070
- :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
7171
- Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
72+
- :class:`Series.str` now has a ``fullmatch`` method that matches a regular expression against the entire string in each row of the series, similar to ``re.fullmatch`` (:issue:`32806`).
7273
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
7374
-
7475

pandas/core/strings.py

+61-4
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22
from functools import wraps
33
import re
44
import textwrap
5-
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union
5+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union
66
import warnings
77

88
import numpy as np
99

1010
import pandas._libs.lib as lib
1111
import pandas._libs.missing as libmissing
1212
import pandas._libs.ops as libops
13-
from pandas._typing import ArrayLike, Dtype
13+
from pandas._typing import ArrayLike, Dtype, Scalar
1414
from pandas.util._decorators import Appender
1515

1616
from pandas.core.dtypes.common import (
@@ -787,9 +787,15 @@ def rep(x, r):
787787
return result
788788

789789

790-
def str_match(arr, pat, case=True, flags=0, na=np.nan):
790+
def str_match(
791+
arr: ArrayLike,
792+
pat: Union[str, Pattern],
793+
case: bool = True,
794+
flags: int = 0,
795+
na: Scalar = np.nan,
796+
):
791797
"""
792-
Determine if each string matches a regular expression.
798+
Determine if each string starts with a match of a regular expression.
793799
794800
Parameters
795801
----------
@@ -808,6 +814,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan):
808814
809815
See Also
810816
--------
817+
fullmatch : Stricter matching that requires the entire string to match.
811818
contains : Analogous, but less strict, relying on re.search instead of
812819
re.match.
813820
extract : Extract matched groups.
@@ -823,6 +830,50 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan):
823830
return _na_map(f, arr, na, dtype=dtype)
824831

825832

833+
def str_fullmatch(
    arr: ArrayLike,
    pat: Union[str, Pattern],
    case: bool = True,
    flags: int = 0,
    na: Scalar = np.nan,
):
    """
    Determine if each string entirely matches a regular expression.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    pat : str or compiled regex
        Character sequence or regular expression; it is handed to
        ``re.compile`` together with `flags`.
    case : bool, default True
        If True, case sensitive. If False, ``re.IGNORECASE`` is added to
        `flags`.
    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    match : Similar, but also returns `True` when only a *prefix* of the string
        matches the regular expression.
    extract : Extract matched groups.
    """
    # NOTE(review): `arr` is intentionally left out of the Parameters section;
    # this docstring appears to be reused for the ``fullmatch`` accessor method
    # through ``@copy(str_fullmatch)``, where `arr` is not a parameter.

    # Case-insensitivity is expressed purely through the regex flags.
    effective_flags = flags if case else flags | re.IGNORECASE
    compiled = re.compile(pat, flags=effective_flags)

    def _matches_entirely(value):
        # ``fullmatch`` returns None unless the whole string matches.
        return compiled.fullmatch(value) is not None

    return _na_map(_matches_entirely, arr, na, dtype=bool)
875+
876+
826877
def _get_single_group_name(rx):
827878
try:
828879
return list(rx.groupindex.keys()).pop()
@@ -2762,6 +2813,12 @@ def match(self, pat, case=True, flags=0, na=np.nan):
27622813
result = str_match(self._parent, pat, case=case, flags=flags, na=na)
27632814
return self._wrap_result(result, fill_value=na, returns_string=False)
27642815

2816+
@copy(str_fullmatch)
@forbid_nonstring_types(["bytes"])
def fullmatch(self, pat, case=True, flags=0, na=np.nan):
    # Thin accessor wrapper: run the module-level ``str_fullmatch`` on the
    # wrapped data (``self._parent``) and hand the raw result to
    # ``_wrap_result`` with ``returns_string=False``, passing `na` through
    # as the fill value.
    return self._wrap_result(
        str_fullmatch(self._parent, pat, case=case, flags=flags, na=na),
        fill_value=na,
        returns_string=False,
    )
2821+
27652822
@copy(str_replace)
27662823
@forbid_nonstring_types(["bytes"])
27672824
def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):

pandas/tests/test_strings.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def assert_series_or_index_equal(left, right):
4141
("join", (",",), {}),
4242
("ljust", (10,), {}),
4343
("match", ("a",), {}),
44+
("fullmatch", ("a",), {}),
4445
("normalize", ("NFC",), {}),
4546
("pad", (10,), {}),
4647
("partition", (" ",), {"expand": False}),
@@ -1176,9 +1177,9 @@ def test_match(self):
11761177
exp = Series([True, np.nan, False])
11771178
tm.assert_series_equal(result, exp)
11781179

1179-
values = Series(["fooBAD__barBAD", np.nan, "foo"])
1180+
values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
11801181
result = values.str.match(".*BAD[_]+.*BAD")
1181-
exp = Series([True, np.nan, False])
1182+
exp = Series([True, True, np.nan, False])
11821183
tm.assert_series_equal(result, exp)
11831184

11841185
# mixed
@@ -1208,6 +1209,22 @@ def test_match(self):
12081209
exp = Series([True, np.nan, np.nan])
12091210
tm.assert_series_equal(exp, res)
12101211

1212+
def test_fullmatch(self):
    # GH 32806
    # Only the first entry is matched by the pattern in its entirety; the
    # second is expected to be False under ``fullmatch`` (``test_match``
    # expects True for the same entry under ``match``).
    pattern = ".*BAD[_]+.*BAD"
    raw = ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]
    outcome = [True, False, np.nan, False]

    # Series built without an explicit dtype
    tm.assert_series_equal(Series(raw).str.fullmatch(pattern), Series(outcome))

    # Make sure that the new string arrays work:
    # result is nullable boolean with StringDtype
    tm.assert_series_equal(
        Series(raw, dtype="string").str.fullmatch(pattern),
        Series(outcome, dtype="boolean"),
    )
1227+
12111228
def test_extract_expand_None(self):
12121229
values = Series(["fooBAD__barBAD", np.nan, "foo"])
12131230
with pytest.raises(ValueError, match="expand must be True or False"):
@@ -3384,6 +3401,9 @@ def test_match_findall_flags(self):
33843401
result = data.str.match(pat, flags=re.IGNORECASE)
33853402
assert result[0]
33863403

3404+
result = data.str.fullmatch(pat, flags=re.IGNORECASE)
3405+
assert result[0]
3406+
33873407
result = data.str.findall(pat, flags=re.IGNORECASE)
33883408
assert result[0][0] == ("dave", "google", "com")
33893409

0 commit comments

Comments
 (0)