From 69229d7fa8b19701b9bf8624dee92a4979d7f161 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 5 May 2021 11:41:08 +0100 Subject: [PATCH] [ArrowStringArray] PERF: use pa.compute.match_substring_regex for str.match if available --- pandas/core/arrays/string_arrow.py | 8 +++ pandas/core/strings/base.py | 6 +- pandas/core/strings/object_array.py | 6 +- pandas/tests/strings/test_find_replace.py | 74 ++++++++++++++++------- 4 files changed, 63 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 44298401d02cb..e48de531db86c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -19,6 +19,7 @@ Dtype, NpDtype, PositionalIndexer, + Scalar, type_t, ) from pandas.util._decorators import doc @@ -808,6 +809,13 @@ def _str_endswith(self, pat, na=None): else: return super()._str_endswith(pat, na) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None + ): + if not pat.startswith("^"): + pat = "^" + pat + return self._str_contains(pat, case, flags, na, regex=True) + def _str_isalnum(self): result = pc.utf8_is_alnum(self._data) return BooleanDtype().__from_arrow__(result) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index b8033668aa18f..a77f8861a7c02 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -61,11 +61,7 @@ def _str_repeat(self, repeats): @abc.abstractmethod def _str_match( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0d8db3d3778a3..869eabc76b555 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -186,11 +186,7 @@ def rep(x, r): return result def _str_match( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = None, + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): if not case: flags |= re.IGNORECASE diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 404a7aaf3c9a9..06a7c6d56a61d 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -409,19 +409,39 @@ def test_replace_literal(any_string_dtype): values.str.replace(compiled_pat, "", regex=False) -def test_match(): +def test_match(any_string_dtype): # New match behavior introduced in 0.13 - values = Series(["fooBAD__barBAD", np.nan, "foo"]) + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + + values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") - exp = Series([True, np.nan, False]) - tm.assert_series_equal(result, exp) + expected = Series([True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) - values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) + values = Series( + ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype + ) result = values.str.match(".*BAD[_]+.*BAD") - exp = Series([True, True, np.nan, False]) - tm.assert_series_equal(result, exp) + expected = Series([True, True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) - # mixed + result = values.str.match("BAD[_]+.*BAD") + expected = Series([False, True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + values = Series( + ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype + ) + result = values.str.match("^BAD[_]+.*BAD") + expected = Series([False, False, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + result = values.str.match("\\^BAD[_]+.*BAD") + expected = Series([False, True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_match_mixed_object(): mixed = Series( [ "aBAD_BAD", @@ -435,22 +455,34 @@ def test_match(): 2.0, ] ) - rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") - xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) + result = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") + expected = Series( + [True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan] + ) + assert isinstance(result, Series) + tm.assert_series_equal(result, expected) + - # na GH #6609 - res = Series(["a", 0, np.nan]).str.match("a", na=False) - exp = Series([True, False, False]) - tm.assert_series_equal(exp, res) - res = Series(["a", 0, np.nan]).str.match("a") - exp = Series([True, np.nan, np.nan]) - tm.assert_series_equal(exp, res) +def test_match_na_kwarg(any_string_dtype): + # GH #6609 + s = Series(["a", "b", np.nan], dtype=any_string_dtype) - values = Series(["ab", "AB", "abc", "ABC"]) + result = s.str.match("a", na=False) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([True, False, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + result = s.str.match("a") + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series([True, False, np.nan], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_match_case_kwarg(any_string_dtype): + values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected = Series([True, True, True, True]) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected)