From 69229d7fa8b19701b9bf8624dee92a4979d7f161 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 5 May 2021 11:41:08 +0100
Subject: [PATCH] [ArrowStringArray] PERF: use pa.compute.match_substring_regex
 for str.match if available

---
 pandas/core/arrays/string_arrow.py        |  8 +++
 pandas/core/strings/base.py               |  6 +-
 pandas/core/strings/object_array.py       |  6 +-
 pandas/tests/strings/test_find_replace.py | 74 ++++++++++++++++-------
 4 files changed, 63 insertions(+), 31 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 44298401d02cb..e48de531db86c 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -19,6 +19,7 @@
     Dtype,
     NpDtype,
     PositionalIndexer,
+    Scalar,
     type_t,
 )
 from pandas.util._decorators import doc
@@ -808,6 +809,13 @@ def _str_endswith(self, pat, na=None):
         else:
             return super()._str_endswith(pat, na)
 
+    def _str_match(
+        self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
+    ):
+        # Group, then anchor: a bare "^" prefix binds only to the first "|" branch.
+        pat = "^(?:" + pat + ")"
+        return self._str_contains(pat, case, flags, na, regex=True)
+
     def _str_isalnum(self):
         result = pc.utf8_is_alnum(self._data)
         return BooleanDtype().__from_arrow__(result)
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
index b8033668aa18f..a77f8861a7c02 100644
--- a/pandas/core/strings/base.py
+++ b/pandas/core/strings/base.py
@@ -61,11 +61,7 @@ def _str_repeat(self, repeats):
 
     @abc.abstractmethod
     def _str_match(
-        self,
-        pat: Union[str, Pattern],
-        case: bool = True,
-        flags: int = 0,
-        na: Scalar = np.nan,
+        self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan
     ):
         pass
 
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 0d8db3d3778a3..869eabc76b555 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -186,11 +186,7 @@ def rep(x, r):
             return result
 
     def _str_match(
-        self,
-        pat: Union[str, Pattern],
-        case: bool = True,
-        flags: int = 0,
-        na: Scalar = None,
+        self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
     ):
         if not case:
             flags |= re.IGNORECASE
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 404a7aaf3c9a9..06a7c6d56a61d 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -409,19 +409,39 @@ def test_replace_literal(any_string_dtype):
         values.str.replace(compiled_pat, "", regex=False)
 
 
-def test_match():
+def test_match(any_string_dtype):
     # New match behavior introduced in 0.13
-    values = Series(["fooBAD__barBAD", np.nan, "foo"])
+    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+
+    values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
     result = values.str.match(".*(BAD[_]+).*(BAD)")
-    exp = Series([True, np.nan, False])
-    tm.assert_series_equal(result, exp)
+    expected = Series([True, np.nan, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
 
-    values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
+    values = Series(
+        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
+    )
     result = values.str.match(".*BAD[_]+.*BAD")
-    exp = Series([True, True, np.nan, False])
-    tm.assert_series_equal(result, exp)
+    expected = Series([True, True, np.nan, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
 
-    # mixed
+    result = values.str.match("BAD[_]+.*BAD")  # anchored at string start
+    expected = Series([False, True, np.nan, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    values = Series(
+        ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
+    )
+    result = values.str.match("^BAD[_]+.*BAD")  # "^" is an anchor, not a literal
+    expected = Series([False, False, np.nan, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = values.str.match("\\^BAD[_]+.*BAD")  # escaped "^" is a literal
+    expected = Series([False, True, np.nan, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_match_mixed_object():
     mixed = Series(
         [
             "aBAD_BAD",
@@ -435,22 +455,34 @@ def test_match():
             2.0,
         ]
     )
-    rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)")
-    xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan])
-    assert isinstance(rs, Series)
-    tm.assert_series_equal(rs, xp)
+    result = Series(mixed).str.match(".*(BAD[_]+).*(BAD)")
+    expected = Series(
+        [True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan]
+    )
+    assert isinstance(result, Series)
+    tm.assert_series_equal(result, expected)
+
 
-    # na GH #6609
-    res = Series(["a", 0, np.nan]).str.match("a", na=False)
-    exp = Series([True, False, False])
-    tm.assert_series_equal(exp, res)
-    res = Series(["a", 0, np.nan]).str.match("a")
-    exp = Series([True, np.nan, np.nan])
-    tm.assert_series_equal(exp, res)
+def test_match_na_kwarg(any_string_dtype):
+    # GH #6609
+    s = Series(["a", "b", np.nan], dtype=any_string_dtype)
 
-    values = Series(["ab", "AB", "abc", "ABC"])
+    result = s.str.match("a", na=False)  # NaN is filled with False
+    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected = Series([True, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.match("a")  # default na: NaN stays missing
+    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected = Series([True, False, np.nan], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_match_case_kwarg(any_string_dtype):  # case=False: match ignores letter case
+    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
     result = values.str.match("ab", case=False)
-    expected = Series([True, True, True, True])
+    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected = Series([True, True, True, True], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)