Skip to content

Commit 1dc4117

Browse files
Fix pandas-dev#61072: inconsistent fullmatch results with regex alternation in PyArrow strings
Fixes an issue where regex patterns with alternation (|) produce different results between the str dtype and the string[pyarrow] dtype. With a pattern like "(as)|(as)", the PyArrow implementation incorrectly matches "asdf", while Python's implementation correctly rejects it. The fix adds special handling for PyArrow-backed strings: a pattern containing alternation is wrapped in an outer pair of parentheses unless it is already fully enclosed by one.
1 parent b53a96a commit 1dc4117

File tree

2 files changed

+11
-9
lines changed

2 files changed

+11
-9
lines changed

pandas/core/strings/accessor.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1504,7 +1504,6 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
15041504

15051505
# Handle pattern modification for PyArrow implementation
15061506
if is_pyarrow and "|" in pat:
1507-
# For patterns with alternation, ensure they're properly grouped for PyArrow
15081507
def _is_fully_wrapped(pattern):
15091508
if not (pattern.startswith('(') and pattern.endswith(')')):
15101509
return False
@@ -1535,9 +1534,11 @@ def _is_fully_wrapped(pattern):
15351534
return False
15361535
level -= 1
15371536

1538-
# If we end with zero level, the outer parentheses fully wrap the pattern
1537+
# If we end with zero level,
1538+
# the outer parentheses fully wrap the pattern
15391539
return level == 0
15401540

1541+
15411542
if not (pat.startswith('(') and pat.endswith(')') and
15421543
_is_fully_wrapped(pat)):
15431544
pat = f"({pat})"

pandas/tests/test.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,6 @@
22
print(pandas.__version__)
33

44

5-
ser = pandas.Series(["cat", "duck", "dove"])
6-
print(ser.str.fullmatch(r"d.+"))
7-
#False
8-
#True
9-
#True
10-
115
test_series = pandas.Series(['asdf', 'as'], dtype='string[pyarrow]')
126
regex = r'((as)|(as))'
137
regex2 = r'(as)|(as)'
@@ -27,4 +21,11 @@
2721
# True
2822
print(test_series2.str.fullmatch(regex2))
2923
# False
30-
# True
24+
# True
25+
26+
27+
ser = pandas.Series(["cat", "duck", "dove"])
28+
print(ser.str.fullmatch(r"d.+"))
29+
#False
30+
#True
31+
#True

0 commit comments

Comments
 (0)