Skip to content

Commit 9199345

Browse files
JackCollins91pull[bot]
authored and committed
Bug pyarrow implementation of str.fullmatch matches partial string. issue #56652 (#56691)
* BUG-Pyarrow-implementation-of-str.fullmatch-matches-partial-string.-Issue-#56652 Changed array.py: makes Series(["abc$abc"]).str.fullmatch("abc\\$") give the same result when dtype = pyarrow.string as when dtype = str. The issue reporter (Issue-#56652) requested changing "//$" to "\$", but this resulted in a DeprecationWarning, so "\\$" was used instead. Changed test_arrow.py: updated test_str_fullmatch to account for edge cases where the string starts with and ends with a literal $ as well as ends with the string. * tidy * Update and test string_arrow._str_fullmatch Changed string_arrow.py: because it should be consistent with the other changes. Affects the circumstance where dtype="string[pyarrow]" as opposed to dtype=pd.ArrowDtype(pa.string()). Changed test_find_replace.py: added a unit test for the edge cases where the user wants to search for a string that ends in a literal dollar sign. The string_arrow.py updates affect this also. Question: For consistency, I formatted test_fullmatch_dollar_literal() to match the other unit tests in the .py file - should I instead aim to implement @pytest.mark.parametrize, which is inconsistent with them, but more in line with developer guidelines? * Update v2.2.0.rst
1 parent da260e8 commit 9199345

File tree

5 files changed

+24
-9
lines changed

5 files changed

+24
-9
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -805,6 +805,7 @@ Strings
805805
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
806806
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
807807
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
808+
- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allowing partial matches when the regex ends in a literal ``\$`` (:issue:`56652`)
808809
- Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`)
809810

810811
Interval

pandas/core/arrays/arrow/array.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2296,7 +2296,7 @@ def _str_match(
22962296
def _str_fullmatch(
22972297
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
22982298
) -> Self:
2299-
if not pat.endswith("$") or pat.endswith("//$"):
2299+
if not pat.endswith("$") or pat.endswith("\\$"):
23002300
pat = f"{pat}$"
23012301
return self._str_match(pat, case, flags, na)
23022302

pandas/core/arrays/string_arrow.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,7 @@ def _str_match(
436436
def _str_fullmatch(
437437
self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
438438
):
439-
if not pat.endswith("$") or pat.endswith("//$"):
439+
if not pat.endswith("$") or pat.endswith("\\$"):
440440
pat = f"{pat}$"
441441
return self._str_match(pat, case, flags, na)
442442

pandas/tests/extension/test_arrow.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -1898,16 +1898,21 @@ def test_str_match(pat, case, na, exp):
18981898
@pytest.mark.parametrize(
18991899
"pat, case, na, exp",
19001900
[
1901-
["abc", False, None, [True, None]],
1902-
["Abc", True, None, [False, None]],
1903-
["bc", True, None, [False, None]],
1904-
["ab", False, True, [True, True]],
1905-
["a[a-z]{2}", False, None, [True, None]],
1906-
["A[a-z]{1}", True, None, [False, None]],
1901+
["abc", False, None, [True, True, False, None]],
1902+
["Abc", True, None, [False, False, False, None]],
1903+
["bc", True, None, [False, False, False, None]],
1904+
["ab", False, None, [True, True, False, None]],
1905+
["a[a-z]{2}", False, None, [True, True, False, None]],
1906+
["A[a-z]{1}", True, None, [False, False, False, None]],
1907+
# GH Issue: #56652
1908+
["abc$", False, None, [True, False, False, None]],
1909+
["abc\\$", False, None, [False, True, False, None]],
1910+
["Abc$", True, None, [False, False, False, None]],
1911+
["Abc\\$", True, None, [False, False, False, None]],
19071912
],
19081913
)
19091914
def test_str_fullmatch(pat, case, na, exp):
1910-
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
1915+
ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
19111916
result = ser.str.match(pat, case=case, na=na)
19121917
expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
19131918
tm.assert_series_equal(result, expected)

pandas/tests/strings/test_find_replace.py

+9
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,15 @@ def test_fullmatch(any_string_dtype):
730730
tm.assert_series_equal(result, expected)
731731

732732

733+
def test_fullmatch_dollar_literal(any_string_dtype):
734+
# GH 56652
735+
ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype)
736+
result = ser.str.fullmatch("foo\\$")
737+
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
738+
expected = Series([False, False, np.nan, True], dtype=expected_dtype)
739+
tm.assert_series_equal(result, expected)
740+
741+
733742
def test_fullmatch_na_kwarg(any_string_dtype):
734743
ser = Series(
735744
["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype

0 commit comments

Comments
 (0)