Skip to content

Commit c2b768b

Browse files
Fix pandas-dev#61072: inconsistent fullmatch results with regex alternation in PyArrow strings
Fixes an issue where regex patterns with alternation (|) produce different results between the str dtype and the string[pyarrow] dtype. When using patterns like "(as)|(as)", the PyArrow implementation would incorrectly match "asdf", while Python's implementation correctly rejects it. The fix adds special handling to ensure alternation patterns are properly parenthesized when using PyArrow-backed strings.
1 parent 40a8180 commit c2b768b

File tree

2 files changed

+86
-4
lines changed

2 files changed

+86
-4
lines changed

pandas/core/strings/accessor.py

+55-4
Original file line numberDiff line numberDiff line change
@@ -1461,16 +1461,67 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14611461
match : Similar, but also returns `True` when only a *prefix* of the string
14621462
matches the regular expression.
14631463
extract : Extract matched groups.
1464-
1464+
1465+
Notes
1466+
-----
1467+
This method enforces consistent behavior between Python's string dtype
1468+
and PyArrow-backed string arrays when using regular expressions
1469+
containing alternation (|). For regex patterns with alternation operators,
1470+
the method ensures proper grouping by wrapping the pattern in parentheses
1471+
when using PyArrow-backed string arrays.
14651472
Examples
14661473
--------
14671474
>>> ser = pd.Series(["cat", "duck", "dove"])
14681475
>>> ser.str.fullmatch(r"d.+")
1469-
0 False
1470-
1 True
1471-
2 True
1476+
0 False
1477+
1 True
1478+
2 True
1479+
dtype: bool
1480+
Ensure consistent behavior with alternation patterns:
1481+
>>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]')
1482+
>>> ser.str.fullmatch(r"(as)|(as)")
1483+
0 False
1484+
1 True
14721485
dtype: bool
14731486
"""
1487+
is_pyarrow = False
1488+
arr = self._data.array
1489+
arr_type = type(arr).__name__
1490+
is_pyarrow = arr_type == "ArrowStringArray"
1491+
if not is_pyarrow:
1492+
is_pyarrow = "Arrow" in arr_type
1493+
if not is_pyarrow and hasattr(arr, "dtype"):
1494+
dtype_str = str(arr.dtype)
1495+
is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
1496+
if is_pyarrow and "|" in pat:
1497+
def _is_fully_wrapped(pattern):
1498+
if not (pattern.startswith('(') and pattern.endswith(')')):
1499+
return False
1500+
inner = pattern[1:-1]
1501+
level = 0
1502+
escape = False
1503+
in_char_class = False
1504+
for char in inner:
1505+
if escape:
1506+
escape = False
1507+
continue
1508+
if char == '\\':
1509+
escape = True
1510+
elif not in_char_class and char == '[':
1511+
in_char_class = True
1512+
elif in_char_class and char == ']':
1513+
in_char_class = False
1514+
elif not in_char_class:
1515+
if char == '(':
1516+
level += 1
1517+
elif char == ')':
1518+
if level == 0:
1519+
return False
1520+
level -= 1
1521+
return level == 0
1522+
if not (pat.startswith('(') and pat.endswith(')') and
1523+
_is_fully_wrapped(pat)):
1524+
pat = f"({pat})"
14741525
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
14751526
return self._wrap_result(result, fill_value=na, returns_string=False)
14761527

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pytest
2+
from pandas import (
3+
Series,
4+
)
5+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
def test_string_array(dtype):
    """
    fullmatch must agree across dtypes for the GH#61072 alternation patterns.

    Both the explicitly wrapped pattern and the bare alternation must
    reject "asdf" and accept "as".
    """
    if dtype == "string[pyarrow]":
        # Skip instead of erroring when the optional dependency is missing.
        pytest.importorskip("pyarrow")

    ser = Series(["asdf", "as"], dtype=dtype)
    wrapped = r"((as)|(as))"
    bare = r"(as)|(as)"

    assert list(ser.str.fullmatch(wrapped)) == [False, True]
    assert list(ser.str.fullmatch(bare)) == [False, True]
12+
@pytest.mark.parametrize(
    "data, pattern, expected",
    [
        (["cat", "duck", "dove"], r"d.+", [False, True, True]),
    ],
)
def test_string_match(data, pattern, expected):
    """Check fullmatch on a default-dtype Series against the expected flags."""
    matched = Series(data).str.fullmatch(pattern)
    assert list(matched) == expected
21+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
@pytest.mark.parametrize(
    "pattern, expected",
    [
        (r"(foo)|((as)(df)?)", [True, True, True]),
        ("foo|as", [False, True, True]),
    ],
)
def test_string_alternation_patterns(dtype, pattern, expected):
    """
    Alternation patterns must give the same fullmatch result for every dtype.

    Covers a pattern with nested and optional groups and a bare
    alternation without any grouping.
    """
    if dtype == "string[pyarrow]":
        # Skip instead of erroring when the optional dependency is missing.
        pytest.importorskip("pyarrow")

    ser = Series(["asdf", "foo", "as"], dtype=dtype)
    assert list(ser.str.fullmatch(pattern)) == expected

0 commit comments

Comments
 (0)