Skip to content

Fix #61072: inconsistent fullmatch results with regex alternation #61343

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 55 additions & 4 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1461,16 +1461,67 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
match : Similar, but also returns `True` when only a *prefix* of the string
matches the regular expression.
extract : Extract matched groups.


Notes
-----
Results are consistent between Python-backed string dtypes and
PyArrow-backed string arrays for regular expressions that contain
alternation (``|``). For PyArrow-backed arrays, a pattern containing ``|``
is wrapped in parentheses, unless it is already enclosed in a single outer
group, so that the alternation as a whole, rather than one of its branches,
is matched against the entire string.

Examples
--------
>>> ser = pd.Series(["cat", "duck", "dove"])
>>> ser.str.fullmatch(r"d.+")
0    False
1     True
2     True
dtype: bool

Ensure consistent behavior with alternation patterns:

>>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]')
>>> ser.str.fullmatch(r"(as)|(as)")
0    False
1     True
dtype: boolean
"""
is_pyarrow = False
arr = self._data.array
arr_type = type(arr).__name__
is_pyarrow = arr_type == "ArrowStringArray"
if not is_pyarrow:
is_pyarrow = "Arrow" in arr_type
if not is_pyarrow and hasattr(arr, "dtype"):
dtype_str = str(arr.dtype)
is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
if is_pyarrow and "|" in pat:
def _is_fully_wrapped(pattern):
if not (pattern.startswith('(') and pattern.endswith(')')):
return False
inner = pattern[1:-1]
level = 0
escape = False
in_char_class = False
for char in inner:
if escape:
escape = False
continue
if char == '\\':
escape = True
elif not in_char_class and char == '[':
in_char_class = True
elif in_char_class and char == ']':
in_char_class = False
elif not in_char_class:
if char == '(':
level += 1
elif char == ')':
if level == 0:
return False
level -= 1
return level == 0
if not (pat.startswith('(') and pat.endswith(')') and
_is_fully_wrapped(pat)):
pat = f"({pat})"
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
return self._wrap_result(result, fill_value=na, returns_string=False)

Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/strings/test_pyarrow_format_behavior.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pytest
from pandas import (
Series,
)
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
def test_string_array(dtype):
    """
    fullmatch on an alternation gives the same result whether or not the
    pattern is enclosed in an outer group, for both string backends.
    """
    ser = Series(['asdf', 'as'], dtype=dtype)
    # Explicitly grouped form first, then the bare alternation.
    for pattern in (r'((as)|(as))', r'(as)|(as)'):
        assert list(ser.str.fullmatch(pattern)) == [False, True]
@pytest.mark.parametrize(
    "data, pattern, expected",
    [(["cat", "duck", "dove"], r"d.+", [False, True, True])],
)
def test_string_match(data, pattern, expected):
    """fullmatch without alternation on a default-dtype Series."""
    matched = Series(data).str.fullmatch(pattern)
    assert list(matched) == expected
@pytest.mark.parametrize("dtype", ["string[pyarrow]", str])
@pytest.mark.parametrize(
    "pattern, expected",
    [
        (r'(foo)|((as)(df)?)', [True, True, True]),
        ('foo|as', [False, True, True]),
    ],
)
def test_string_alternation_patterns(dtype, pattern, expected):
    """
    Alternation patterns, grouped and ungrouped, must match the entire
    string on both string backends.
    """
    values = ['asdf', 'foo', 'as']
    outcome = Series(values, dtype=dtype).str.fullmatch(pattern)
    assert list(outcome) == expected
Loading