Skip to content

Commit d0086b2

Browse files
Fix pandas-dev#61072: inconsistent fullmatch results with regex alternation
in PyArrow strings. This commit fixes an issue where regex patterns with alternation (|) produce different results between the str dtype and the string[pyarrow] dtype. With a pattern like "(as)|(as)", the PyArrow implementation incorrectly matches "asdf", while Python's implementation correctly rejects it. The fix adds special handling to ensure that alternation patterns are properly parenthesized when using PyArrow-backed strings.
1 parent a6d1d7f commit d0086b2

File tree

2 files changed

+33
-54
lines changed

2 files changed

+33
-54
lines changed

pandas/core/strings/accessor.py

+2-27
Original file line numberDiff line numberDiff line change
@@ -1429,12 +1429,10 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
14291429
def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14301430
"""
14311431
Determine if each string entirely matches a regular expression.
1432-
14331432
Checks if each string in the Series or Index fully matches the
14341433
specified regular expression pattern. This function is useful when the
14351434
requirement is for an entire string to conform to a pattern, such as
14361435
validating formats like phone numbers or email addresses.
1437-
14381436
Parameters
14391437
----------
14401438
pat : str
@@ -1448,28 +1446,24 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14481446
array. For object-dtype, ``numpy.nan`` is used. For the nullable
14491447
``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype,
14501448
``False`` is used.
1451-
14521449
Returns
14531450
-------
14541451
Series/Index/array of boolean values
14551452
The function returns a Series, Index, or array of boolean values,
14561453
where True indicates that the entire string matches the regular
14571454
expression pattern and False indicates that it does not.
1458-
14591455
See Also
14601456
--------
14611457
match : Similar, but also returns `True` when only a *prefix* of the string
14621458
matches the regular expression.
14631459
extract : Extract matched groups.
1464-
14651460
Notes
14661461
-----
14671462
This method enforces consistent behavior between Python's string dtype
14681463
and PyArrow-backed string arrays when using regular expressions
14691464
containing alternation (|). For regex patterns with alternation operators,
14701465
the method ensures proper grouping by wrapping the pattern in parentheses
14711466
when using PyArrow-backed string arrays.
1472-
14731467
Examples
14741468
--------
14751469
>>> ser = pd.Series(["cat", "duck", "dove"])
@@ -1478,46 +1472,34 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
14781472
1 True
14791473
2 True
14801474
dtype: bool
1481-
14821475
Ensure consistent behavior with alternation patterns:
1483-
14841476
>>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]')
14851477
>>> ser.str.fullmatch(r"(as)|(as)")
14861478
0 False
14871479
1 True
14881480
dtype: bool
14891481
"""
1490-
14911482
is_pyarrow = False
1492-
14931483
arr = self._data.array
1494-
14951484
arr_type = type(arr).__name__
14961485
is_pyarrow = arr_type == "ArrowStringArray"
1497-
14981486
if not is_pyarrow:
14991487
is_pyarrow = "Arrow" in arr_type
1500-
15011488
if not is_pyarrow and hasattr(arr, "dtype"):
15021489
dtype_str = str(arr.dtype)
15031490
is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
1504-
1505-
# Handle pattern modification for PyArrow implementation
15061491
if is_pyarrow and "|" in pat:
15071492
def _is_fully_wrapped(pattern):
15081493
if not (pattern.startswith('(') and pattern.endswith(')')):
1509-
return False
1510-
1494+
return False
15111495
inner = pattern[1:-1]
15121496
level = 0
15131497
escape = False
15141498
in_char_class = False
1515-
15161499
for char in inner:
15171500
if escape:
15181501
escape = False
1519-
continue
1520-
1502+
continue
15211503
if char == '\\':
15221504
escape = True
15231505
elif not in_char_class and char == '[':
@@ -1529,19 +1511,12 @@ def _is_fully_wrapped(pattern):
15291511
level += 1
15301512
elif char == ')':
15311513
if level == 0:
1532-
# Found a closing parenthesis without matching opening one
15331514
return False
15341515
level -= 1
1535-
1536-
# If we end with zero level,
1537-
# the outer parentheses fully wrap the pattern
15381516
return level == 0
1539-
1540-
15411517
if not (pat.startswith('(') and pat.endswith(')') and
15421518
_is_fully_wrapped(pat)):
15431519
pat = f"({pat})"
1544-
15451520
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
15461521
return self._wrap_result(result, fill_value=na, returns_string=False)
15471522

Original file line numberDiff line numberDiff line change
@@ -1,27 +1,31 @@
1-
import pandas
2-
3-
test_series = pandas.Series(['asdf', 'as'], dtype='string[pyarrow]')
4-
regex = r'((as)|(as))'
5-
regex2 = r'(as)|(as)'
6-
7-
print(test_series.str.fullmatch(regex))
8-
# False
9-
# True
10-
print(test_series.str.fullmatch(regex2))
11-
# True
12-
# True
13-
14-
test_series2 = pandas.Series(['asdf', 'as'], dtype=str)
15-
16-
print(test_series2.str.fullmatch(regex))
17-
# False
18-
# True
19-
print(test_series2.str.fullmatch(regex2))
20-
# False
21-
# True
22-
23-
ser = pandas.Series(["cat", "duck", "dove"])
24-
print(ser.str.fullmatch(r"d.+"))
25-
# False
26-
# True
27-
# True
1+
import pytest
2+
from pandas import (
3+
Series,
4+
)
5+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
6+
def test_string_array(dtype):
7+
test_series = Series(['asdf', 'as'], dtype=dtype)
8+
regex = r'((as)|(as))'
9+
regex2 = r'(as)|(as)'
10+
assert list(test_series.str.fullmatch(regex)) == [False, True]
11+
assert list(test_series.str.fullmatch(regex2)) == [False, True]
12+
@pytest.mark.parametrize(
13+
"data, pattern, expected",
14+
[
15+
(["cat", "duck", "dove"], r"d.+", [False, True, True]),
16+
],
17+
)
18+
def test_string_match(data, pattern, expected):
19+
ser = Series(data)
20+
assert list(ser.str.fullmatch(pattern)) == expected
21+
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
22+
@pytest.mark.parametrize(
23+
"pattern, expected",
24+
[
25+
(r'(foo)|((as)(df)?)', [True, True, True]),
26+
('foo|as', [False, True, True]),
27+
],
28+
)
29+
def test_string_alternation_patterns(dtype, pattern, expected):
30+
ser = Series(['asdf', 'foo', 'as'], dtype=dtype)
31+
assert list(ser.str.fullmatch(pattern)) == expected

0 commit comments

Comments
 (0)