From c2b768b490a0cd706711ab2f6ffbaa171f505594 Mon Sep 17 00:00:00 2001 From: Pedro Santos Date: Tue, 25 Mar 2025 19:57:22 +0000 Subject: [PATCH] Fix #61072: inconsistent fullmatch results with regex alternation in PyArrow strings Fixes an issue where regex patterns with alternation (|) produce different results between str dtype and string[pyarrow] dtype. When using patterns like "(as)|(as)", the PyArrow implementation would incorrectly match "asdf" while Python's implementation correctly rejects it. The fix adds special handling to ensure alternation patterns are properly parenthesized when using PyArrow-backed strings. --- pandas/core/strings/accessor.py | 59 +++++++++++++++++-- .../strings/test_pyarrow_format_behavior.py | 31 ++++++++++ 2 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/strings/test_pyarrow_format_behavior.py diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 81f7441846589..36e8ae994f681 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1461,16 +1461,67 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): match : Similar, but also returns `True` when only a *prefix* of the string matches the regular expression. extract : Extract matched groups. - + + Notes + ----- + This method enforces consistent behavior between Python's string dtype + and PyArrow-backed string arrays when using regular expressions + containing alternation (|). For regex patterns with alternation operators, + the method ensures proper grouping by wrapping the pattern in parentheses + when using PyArrow-backed string arrays. 
Examples -------- >>> ser = pd.Series(["cat", "duck", "dove"]) >>> ser.str.fullmatch(r"d.+") - 0 False - 1 True - 2 True + 0 False + 1 True + 2 True + dtype: bool + Ensure consistent behavior with alternation patterns: + >>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]') + >>> ser.str.fullmatch(r"(as)|(as)") + 0 False + 1 True dtype: bool """ + is_pyarrow = False + arr = self._data.array + arr_type = type(arr).__name__ + is_pyarrow = arr_type == "ArrowStringArray" + if not is_pyarrow: + is_pyarrow = "Arrow" in arr_type + if not is_pyarrow and hasattr(arr, "dtype"): + dtype_str = str(arr.dtype) + is_pyarrow = "pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower() + if is_pyarrow and "|" in pat: + def _is_fully_wrapped(pattern): + if not (pattern.startswith('(') and pattern.endswith(')')): + return False + inner = pattern[1:-1] + level = 0 + escape = False + in_char_class = False + for char in inner: + if escape: + escape = False + continue + if char == '\\': + escape = True + elif not in_char_class and char == '[': + in_char_class = True + elif in_char_class and char == ']': + in_char_class = False + elif not in_char_class: + if char == '(': + level += 1 + elif char == ')': + if level == 0: + return False + level -= 1 + return level == 0 + if not (pat.startswith('(') and pat.endswith(')') and + _is_fully_wrapped(pat)): + pat = f"({pat})" result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) diff --git a/pandas/tests/strings/test_pyarrow_format_behavior.py b/pandas/tests/strings/test_pyarrow_format_behavior.py new file mode 100644 index 0000000000000..8399326172aa4 --- /dev/null +++ b/pandas/tests/strings/test_pyarrow_format_behavior.py @@ -0,0 +1,31 @@ +import pytest +from pandas import ( + Series, +) +@pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) +def test_string_array(dtype): + test_series = Series(['asdf', 'as'], dtype=dtype) + regex = 
r'((as)|(as))' + regex2 = r'(as)|(as)' + assert list(test_series.str.fullmatch(regex)) == [False, True] + assert list(test_series.str.fullmatch(regex2)) == [False, True] +@pytest.mark.parametrize( + "data, pattern, expected", + [ + (["cat", "duck", "dove"], r"d.+", [False, True, True]), + ], +) +def test_string_match(data, pattern, expected): + ser = Series(data) + assert list(ser.str.fullmatch(pattern)) == expected +@pytest.mark.parametrize("dtype", ["string[pyarrow]", str]) +@pytest.mark.parametrize( + "pattern, expected", + [ + (r'(foo)|((as)(df)?)', [True, True, True]), + ('foo|as', [False, True, True]), + ], +) +def test_string_alternation_patterns(dtype, pattern, expected): + ser = Series(['asdf', 'foo', 'as'], dtype=dtype) + assert list(ser.str.fullmatch(pattern)) == expected \ No newline at end of file