Skip to content

Commit 7e06033

Browse files
Fix pandas-dev#61072: inconsistent fullmatch results with regex alternation in PyArrow strings
Fixes an issue where regex patterns with alternation (|) produce different results between the str dtype and the string[pyarrow] dtype. With a pattern such as "(as)|(as)", the PyArrow implementation incorrectly matched "asdf", while Python's implementation correctly rejects it. The fix adds special handling so that alternation patterns are properly parenthesized when using PyArrow-backed strings.
1 parent 40a8180 commit 7e06033

File tree

2 files changed

+85
-8
lines changed

2 files changed

+85
-8
lines changed

pandas/core/strings/accessor.py

+54-8
Original file line numberDiff line numberDiff line change
@@ -1429,12 +1429,10 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
14291429
def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14301430
"""
14311431
Determine if each string entirely matches a regular expression.
1432-
14331432
Checks if each string in the Series or Index fully matches the
14341433
specified regular expression pattern. This function is useful when the
14351434
requirement is for an entire string to conform to a pattern, such as
14361435
validating formats like phone numbers or email addresses.
1437-
14381436
Parameters
14391437
----------
14401438
pat : str
@@ -1448,29 +1446,77 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14481446
array. For object-dtype, ``numpy.nan`` is used. For the nullable
14491447
``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype,
14501448
``False`` is used.
1451-
14521449
Returns
14531450
-------
14541451
Series/Index/array of boolean values
14551452
The function returns a Series, Index, or array of boolean values,
14561453
where True indicates that the entire string matches the regular
14571454
expression pattern and False indicates that it does not.
1458-
14591455
See Also
14601456
--------
14611457
match : Similar, but also returns `True` when only a *prefix* of the string
14621458
matches the regular expression.
14631459
extract : Extract matched groups.
1464-
1460+
Notes
1461+
-----
1462+
This method enforces consistent behavior between Python's string dtype
1463+
and PyArrow-backed string arrays when using regular expressions
1464+
containing alternation (|). For regex patterns with alternation operators,
1465+
the method ensures proper grouping by wrapping the pattern in parentheses
1466+
when using PyArrow-backed string arrays.
14651467
Examples
14661468
--------
14671469
>>> ser = pd.Series(["cat", "duck", "dove"])
14681470
>>> ser.str.fullmatch(r"d.+")
1469-
0 False
1470-
1 True
1471-
2 True
1471+
0 False
1472+
1 True
1473+
2 True
1474+
dtype: bool
1475+
Ensure consistent behavior with alternation patterns:
1476+
>>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]')
1477+
>>> ser.str.fullmatch(r"(as)|(as)")
1478+
0 False
1479+
1 True
14721480
dtype: boolean
14731481
"""
1482+
is_pyarrow = False
1483+
arr = self._data.array
1484+
arr_type = type(arr).__name__
1485+
is_pyarrow = arr_type == "ArrowStringArray"
1486+
if not is_pyarrow:
1487+
is_pyarrow = "Arrow" in arr_type
1488+
if not is_pyarrow and hasattr(arr, "dtype"):
1489+
dtype_str = str(arr.dtype)
1490+
is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
1491+
if is_pyarrow and "|" in pat:
1492+
def _is_fully_wrapped(pattern):
1493+
if not (pattern.startswith('(') and pattern.endswith(')')):
1494+
return False
1495+
inner = pattern[1:-1]
1496+
level = 0
1497+
escape = False
1498+
in_char_class = False
1499+
for char in inner:
1500+
if escape:
1501+
escape = False
1502+
continue
1503+
if char == '\\':
1504+
escape = True
1505+
elif not in_char_class and char == '[':
1506+
in_char_class = True
1507+
elif in_char_class and char == ']':
1508+
in_char_class = False
1509+
elif not in_char_class:
1510+
if char == '(':
1511+
level += 1
1512+
elif char == ')':
1513+
if level == 0:
1514+
return False
1515+
level -= 1
1516+
return level == 0
1517+
if not (pat.startswith('(') and pat.endswith(')') and
1518+
_is_fully_wrapped(pat)):
1519+
pat = f"({pat})"
14741520
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
14751521
return self._wrap_result(result, fill_value=na, returns_string=False)
14761522

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pytest
2+
from pandas import (
3+
Series,
4+
)
5+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
def test_string_array(dtype):
    """Check that ``str.fullmatch`` rejects partial matches of alternations.

    The same alternation is tried both explicitly wrapped in an outer group
    and bare; for every parametrized dtype ``"asdf"`` must not match and
    ``"as"`` must match.
    """
    if dtype == "string[pyarrow]":
        # pyarrow is an optional dependency: skip instead of erroring out
        # when it is not installed.
        pytest.importorskip("pyarrow")

    test_series = Series(["asdf", "as"], dtype=dtype)
    for regex in (r"((as)|(as))", r"(as)|(as)"):
        assert list(test_series.str.fullmatch(regex)) == [False, True]
12+
@pytest.mark.parametrize(
    "data, pattern, expected",
    [
        (["cat", "duck", "dove"], r"d.+", [False, True, True]),
    ],
)
def test_string_match(data, pattern, expected):
    """Check ``str.fullmatch`` on a default-dtype Series for a plain pattern."""
    matched = Series(data).str.fullmatch(pattern)
    assert list(matched) == expected
21+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
@pytest.mark.parametrize(
    "pattern, expected",
    [
        (r"(foo)|((as)(df)?)", [True, True, True]),
        ("foo|as", [False, True, True]),
    ],
)
def test_string_alternation_patterns(dtype, pattern, expected):
    """Check ``str.fullmatch`` with grouped and bare alternation patterns.

    Each pattern is matched against ``["asdf", "foo", "as"]`` for every
    parametrized dtype and compared element-wise with ``expected``.
    """
    if dtype == "string[pyarrow]":
        # pyarrow is an optional dependency: skip instead of erroring out
        # when it is not installed.
        pytest.importorskip("pyarrow")

    ser = Series(["asdf", "foo", "as"], dtype=dtype)
    assert list(ser.str.fullmatch(pattern)) == expected

0 commit comments

Comments
 (0)