From eefe5c127d0f2a08014803644ba199544084537d Mon Sep 17 00:00:00 2001 From: John Collins Date: Sun, 31 Dec 2023 11:26:04 +0100 Subject: [PATCH 1/4] BUG-Pyarrow-implementation-of-str.fullmatch-matches-partial-string.-Issue-#56652 changed array.py: Makes Series(["abc$abc]).str.fullmatch("abc\\$") give the same result as when dtype = pyarow.string as opposed to dtype = str. Issue reporter (Issue-#56652) requested change "//$" to "\$", but this resulted in DepreciationWarnng, so used "\\$" instead. Change test_arrow.py: updated test_str_fullmatch to account for edge cases where string starts with and ends with literal $ as well as ends with the string. --- pandas/core/arrays/arrow/array.py | 2 +- pandas/tests/extension/test_arrow.py | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b1164301e6d79..5427cee55dfb1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2277,7 +2277,7 @@ def _str_match( def _str_fullmatch( self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None ): - if not pat.endswith("$") or pat.endswith("//$"): + if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ed1b7b199a16f..153b800a62a1d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1903,18 +1903,24 @@ def test_str_match(pat, case, na, exp): @pytest.mark.parametrize( "pat, case, na, exp", [ - ["abc", False, None, [True, None]], - ["Abc", True, None, [False, None]], - ["bc", True, None, [False, None]], - ["ab", False, True, [True, True]], - ["a[a-z]{2}", False, None, [True, None]], - ["A[a-z]{1}", True, None, [False, None]], + ["abc", False, None, [True, True, False, None]], + ["Abc", True, None, [False, False, False, None]], + ["bc", True, None, [False, False, False, None]], + ["ab", False, None, [True, True, False, None]], + ["a[a-z]{2}", False, None, [True, True, False, None]], + ["A[a-z]{1}", True, None, [False, False, False, None]], + # GH Issue: #56652 + ["abc$", False, None, [True, False, False, None]], + ["abc\\$", False, None, [False, True, False, None]], + ["Abc$", True, None, [False, False, False, None]], + ["Abc\\$", True, None, [False, False, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.match(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) + print(result) tm.assert_series_equal(result, expected) From ccb66f08e81695c15aad5697211644662bfaabf3 Mon Sep 17 00:00:00 2001 From: John Collins Date: Sun, 31 Dec 2023 11:42:04 +0100 Subject: [PATCH 2/4] tidy --- pandas/tests/extension/test_arrow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 153b800a62a1d..e709e6fcfe456 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1920,7 +1920,6 @@ def test_str_fullmatch(pat, case, na, exp): ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.match(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) - print(result) tm.assert_series_equal(result, expected) From 36aafd1f85b87556b6b3c90bc3e51800b7621788 Mon Sep 17 00:00:00 2001 From: John Collins Date: Sun, 31 Dec 2023 19:20:27 +0100 Subject: [PATCH 3/4] Update and test string_arrow._str_fullmatch Changed: string_arrow: because it shoud be conistent with other changes. Affects circumstance where dtype="string[pyarrow]" as opposed to dtype=pd.ArrowDtype(pa.string()) Changed: test_find_replace.py: Added unit test for the edge cases where the user wants o search for a string that ends in a literal dollar sign. The string_arrow.oy updates effect this also. Question: For consistency, I formatted test_fullmatch_dollar_literal() to match other unit tests in .py - Should I instead aim to implement @pytest.mark.parametrize, which is in consistent, but more in line with developer guidelines? --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/strings/test_find_replace.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d5a76811a12e6..e8f614ff855c0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -433,7 +433,7 @@ def _str_match( def _str_fullmatch( self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None ): - if not pat.endswith("$") or pat.endswith("//$"): + if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 3f58c6d703f8f..cd4707ac405de 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -730,6 +730,15 @@ def test_fullmatch(any_string_dtype): tm.assert_series_equal(result, expected) +def test_fullmatch_dollar_literal(any_string_dtype): + # GH 56652 + ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) + result = ser.str.fullmatch("foo\\$") + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected = Series([False, False, np.nan, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype From c0848449e2f88ced21cac5eeaf5d7feca982c58e Mon Sep 17 00:00:00 2001 From: John Collins Date: Wed, 3 Jan 2024 08:47:30 +0100 Subject: [PATCH 4/4] Update v2.2.0.rst --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 649ad37a56b35..fe1d669ecf9e4 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -797,6 +797,7 @@ Strings - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) +- Bug in :meth:`str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`) - Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`) Interval