Skip to content

Commit 40b5f54

Browse files
Fix pandas-dev#61072: inconsistent fullmatch results with regex alternation in PyArrow strings
Fixes an issue where regex patterns with alternation (|) produce different results between the str dtype and the string[pyarrow] dtype. With a pattern like "(as)|(as)", the PyArrow implementation would incorrectly match "asdf", while Python's implementation correctly rejects it. The fix adds special handling that wraps an alternation pattern in an enclosing pair of parentheses, unless it is already fully wrapped, when using PyArrow-backed strings.
1 parent a6d1d7f commit 40b5f54

File tree

2 files changed

+41
-46
lines changed

2 files changed

+41
-46
lines changed

pandas/core/strings/accessor.py

+2-19
Original file line number | Diff line number | Diff line change
@@ -1487,37 +1487,27 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14871487
1 True
14881488
dtype: bool
14891489
"""
1490-
14911490
is_pyarrow = False
1492-
14931491
arr = self._data.array
1494-
14951492
arr_type = type(arr).__name__
14961493
is_pyarrow = arr_type == "ArrowStringArray"
1497-
14981494
if not is_pyarrow:
14991495
is_pyarrow = "Arrow" in arr_type
1500-
15011496
if not is_pyarrow and hasattr(arr, "dtype"):
15021497
dtype_str = str(arr.dtype)
15031498
is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
1504-
1505-
# Handle pattern modification for PyArrow implementation
15061499
if is_pyarrow and "|" in pat:
15071500
def _is_fully_wrapped(pattern):
15081501
if not (pattern.startswith('(') and pattern.endswith(')')):
1509-
return False
1510-
1502+
return False
15111503
inner = pattern[1:-1]
15121504
level = 0
15131505
escape = False
15141506
in_char_class = False
1515-
15161507
for char in inner:
15171508
if escape:
15181509
escape = False
1519-
continue
1520-
1510+
continue
15211511
if char == '\\':
15221512
escape = True
15231513
elif not in_char_class and char == '[':
@@ -1529,19 +1519,12 @@ def _is_fully_wrapped(pattern):
15291519
level += 1
15301520
elif char == ')':
15311521
if level == 0:
1532-
# Found a closing parenthesis without matching opening one
15331522
return False
15341523
level -= 1
1535-
1536-
# If we end with zero level,
1537-
# the outer parentheses fully wrap the pattern
15381524
return level == 0
1539-
1540-
15411525
if not (pat.startswith('(') and pat.endswith(')') and
15421526
_is_fully_wrapped(pat)):
15431527
pat = f"({pat})"
1544-
15451528
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
15461529
return self._wrap_result(result, fill_value=na, returns_string=False)
15471530

Original file line number | Diff line number | Diff line change
@@ -1,27 +1,39 @@
1-
import pandas
2-
3-
test_series = pandas.Series(['asdf', 'as'], dtype='string[pyarrow]')
4-
regex = r'((as)|(as))'
5-
regex2 = r'(as)|(as)'
6-
7-
print(test_series.str.fullmatch(regex))
8-
# False
9-
# True
10-
print(test_series.str.fullmatch(regex2))
11-
# True
12-
# True
13-
14-
test_series2 = pandas.Series(['asdf', 'as'], dtype=str)
15-
16-
print(test_series2.str.fullmatch(regex))
17-
# False
18-
# True
19-
print(test_series2.str.fullmatch(regex2))
20-
# False
21-
# True
22-
23-
ser = pandas.Series(["cat", "duck", "dove"])
24-
print(ser.str.fullmatch(r"d.+"))
25-
# False
26-
# True
27-
# True
1+
import pytest
2+
3+
from pandas import (
4+
Series,
5+
)
6+
7+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
8+
def test_string_array(dtype):
9+
test_series = Series(['asdf', 'as'], dtype=dtype)
10+
regex = r'((as)|(as))'
11+
regex2 = r'(as)|(as)'
12+
13+
assert list(test_series.str.fullmatch(regex)) == [False, True]
14+
assert list(test_series.str.fullmatch(regex2)) == [False, True]
15+
16+
@pytest.mark.parametrize(
17+
"data, pattern, expected",
18+
[
19+
(["cat", "duck", "dove"], r"d.+", [False, True, True]),
20+
],
21+
)
22+
def test_string_match(data, pattern, expected):
23+
ser = Series(data)
24+
assert list(ser.str.fullmatch(pattern)) == expected
25+
26+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
27+
@pytest.mark.parametrize(
28+
"pattern, expected",
29+
[
30+
(r'(foo)|((as)(df)?)', [True, True, True]),
31+
('foo|as', [False, True, True]),
32+
],
33+
)
34+
def test_string_alternation_patterns(dtype, pattern, expected):
35+
ser = Series(['asdf', 'foo', 'as'], dtype=dtype)
36+
assert list(ser.str.fullmatch(pattern)) == expected
37+
38+
39+

0 commit comments

Comments
 (0)