Skip to content

Commit 101ebf5

Browse files
Fix pandas-dev#61072: inconsistent fullmatch results with regex alternation in PyArrow
1 parent 40a8180 commit 101ebf5

File tree

2 files changed

+108
-9
lines changed

2 files changed

+108
-9
lines changed

pandas/core/strings/accessor.py

+81-9
Original file line numberDiff line numberDiff line change
@@ -1429,12 +1429,12 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
14291429
def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14301430
"""
14311431
Determine if each string entirely matches a regular expression.
1432-
1432+
14331433
Checks if each string in the Series or Index fully matches the
14341434
specified regular expression pattern. This function is useful when the
14351435
requirement is for an entire string to conform to a pattern, such as
14361436
validating formats like phone numbers or email addresses.
1437-
1437+
14381438
Parameters
14391439
----------
14401440
pat : str
@@ -1448,32 +1448,104 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14481448
array. For object-dtype, ``numpy.nan`` is used. For the nullable
14491449
``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype,
14501450
``False`` is used.
1451-
1451+
14521452
Returns
14531453
-------
14541454
Series/Index/array of boolean values
14551455
The function returns a Series, Index, or array of boolean values,
14561456
where True indicates that the entire string matches the regular
14571457
expression pattern and False indicates that it does not.
1458-
1458+
14591459
See Also
14601460
--------
14611461
match : Similar, but also returns `True` when only a *prefix* of the string
14621462
matches the regular expression.
14631463
extract : Extract matched groups.
1464-
1464+
1465+
Notes
1466+
-----
1467+
For PyArrow-backed string arrays, a pattern that contains the alternation
1468+
operator (``|``) and is not already enclosed in a single outer group is
1469+
wrapped in parentheses before it is passed to the array's matcher. This
1470+
keeps results for alternation patterns consistent between Python-backed
1471+
and PyArrow-backed string arrays.
1472+
14651473
Examples
14661474
--------
14671475
>>> ser = pd.Series(["cat", "duck", "dove"])
14681476
>>> ser.str.fullmatch(r"d.+")
1469-
0 False
1470-
1 True
1471-
2 True
1477+
0 False
1478+
1 True
1479+
2 True
1480+
dtype: bool
1481+
1482+
Ensure consistent behavior with alternation patterns:
1483+
1484+
>>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]')
1485+
>>> ser.str.fullmatch(r"(as)|(as)")
1486+
0 False
1487+
1 True
14721488
dtype: boolean
14731489
"""
1490+
1491+
is_pyarrow = False
1492+
1493+
arr = self._data.array
1494+
1495+
arr_type = type(arr).__name__
1496+
is_pyarrow = arr_type == "ArrowStringArray"
1497+
1498+
if not is_pyarrow:
1499+
is_pyarrow = "Arrow" in arr_type
1500+
1501+
if not is_pyarrow and hasattr(arr, "dtype"):
1502+
dtype_str = str(arr.dtype)
1503+
is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
1504+
1505+
# Handle pattern modification for PyArrow implementation
1506+
if is_pyarrow and "|" in pat:
1507+
def _is_fully_wrapped(pattern):
1508+
if not (pattern.startswith('(') and pattern.endswith(')')):
1509+
return False
1510+
1511+
inner = pattern[1:-1]
1512+
1513+
level = 0
1514+
escape = False
1515+
in_char_class = False
1516+
1517+
for char in inner:
1518+
if escape:
1519+
escape = False
1520+
continue
1521+
1522+
if char == '\\':
1523+
escape = True
1524+
elif not in_char_class and char == '[':
1525+
in_char_class = True
1526+
elif in_char_class and char == ']':
1527+
in_char_class = False
1528+
elif not in_char_class:
1529+
if char == '(':
1530+
level += 1
1531+
elif char == ')':
1532+
if level == 0:
1533+
# Found a closing parenthesis without matching opening one
1534+
return False
1535+
level -= 1
1536+
1537+
# If we end with zero level,
1538+
# the outer parentheses fully wrap the pattern
1539+
return level == 0
1540+
1541+
1542+
if not (pat.startswith('(') and pat.endswith(')') and
1543+
_is_fully_wrapped(pat)):
1544+
pat = f"({pat})"
1545+
14741546
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
14751547
return self._wrap_result(result, fill_value=na, returns_string=False)
1476-
1548+
14771549
@forbid_nonstring_types(["bytes"])
14781550
def replace(
14791551
self,

pandas/tests/test.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import pandas
2+
3+
# Manual reproduction for pandas-dev#61072: print ``Series.str.fullmatch``
# results for an alternation pattern on different string backends.


def _show_fullmatch(series, *patterns):
    """Print the ``str.fullmatch`` result of ``series`` for each pattern, in order."""
    for pattern in patterns:
        print(series.str.fullmatch(pattern))


# ``regex`` has one outer group around the alternation; ``regex2`` leaves the
# alternation at the top level of the pattern.
test_series = pandas.Series(['asdf', 'as'], dtype='string[pyarrow]')
regex = r'((as)|(as))'
regex2 = r'(as)|(as)'

# PyArrow-backed strings.
#   regex  -> False, True
#   regex2 -> True, True is the output originally recorded here (the
#             inconsistency from the issue); the fullmatch docstring example
#             shows False, True for this same input.
_show_fullmatch(test_series, regex, regex2)

# Same data with ``dtype=str``.
#   regex  -> False, True
#   regex2 -> False, True
test_series2 = pandas.Series(['asdf', 'as'], dtype=str)
_show_fullmatch(test_series2, regex, regex2)

# Pattern without alternation, default dtype.
#   -> False, True, True
ser = pandas.Series(["cat", "duck", "dove"])
_show_fullmatch(ser, r"d.+")

0 commit comments

Comments
 (0)