From 2e2650332c6215a93abc9fdb37e2394c697e7d53 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 19 Apr 2021 14:21:38 +0100
Subject: [PATCH 1/4] [ArrowStringArray] Use `utf8_is_*` functions from Apache
 Arrow if available

---
 pandas/core/arrays/string_arrow.py | 64 ++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 52bdcd03d3b49..864dae999d54d 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -39,6 +39,7 @@
 from pandas.core import missing
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
+from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.indexers import (
     check_array_indexer,
     validate_indices,
@@ -752,3 +753,66 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
             #    or .findall returns a list).
             # -> We don't know the result type. E.g. `.get` can return anything.
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
+
+    def _str_isalnum(self):
+        # No utf8_is_alnum kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_alnum"):
+            return super()._str_isalnum()
+        flags = pc.utf8_is_alnum(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_isalpha(self):
+        # No utf8_is_alpha kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_alpha"):
+            return super()._str_isalpha()
+        flags = pc.utf8_is_alpha(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_isdecimal(self):
+        # No utf8_is_decimal kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_decimal"):
+            return super()._str_isdecimal()
+        flags = pc.utf8_is_decimal(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_isdigit(self):
+        # No utf8_is_digit kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_digit"):
+            return super()._str_isdigit()
+        flags = pc.utf8_is_digit(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_islower(self):
+        # No utf8_is_lower kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_lower"):
+            return super()._str_islower()
+        flags = pc.utf8_is_lower(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_isnumeric(self):
+        # No utf8_is_numeric kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_numeric"):
+            return super()._str_isnumeric()
+        flags = pc.utf8_is_numeric(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_isspace(self):
+        # No utf8_is_space kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_space"):
+            return super()._str_isspace()
+        flags = pc.utf8_is_space(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_istitle(self):
+        # No utf8_is_title kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_title"):
+            return super()._str_istitle()
+        flags = pc.utf8_is_title(self._data)
+        return BooleanDtype().__from_arrow__(flags)
+
+    def _str_isupper(self):
+        # No utf8_is_upper kernel exposed by `pc`: defer to the parent class.
+        if not hasattr(pc, "utf8_is_upper"):
+            return super()._str_isupper()
+        flags = pc.utf8_is_upper(self._data)
+        return BooleanDtype().__from_arrow__(flags)

From 0987b0e73b69a21bf80a774b9e455693ceedad45 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 20 Apr 2021 09:11:00 +0200
Subject: [PATCH 2/4] PERF: optimize conversion from boolean Arrow array to
 masked BooleanArray

---
 pandas/core/arrays/boolean.py                   | 15 +++++++++++++--
 pandas/tests/arrays/masked/test_arrow_compat.py | 11 +++++++++--
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index 0a0bfccc0ea15..d59f74ffd256f 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -122,8 +122,19 @@ def __from_arrow__(
 
         results = []
         for arr in chunks:
-            # TODO should optimize this without going through object array
-            bool_arr = BooleanArray._from_sequence(np.array(arr))
+            buflist = arr.buffers()
+            data = pyarrow.BooleanArray.from_buffers(
+                arr.type, len(arr), [None, buflist[1]], offset=arr.offset
+            ).to_numpy(zero_copy_only=False)
+            if arr.null_count != 0:
+                mask = pyarrow.BooleanArray.from_buffers(
+                    arr.type, len(arr), [None, buflist[0]], offset=arr.offset
+                ).to_numpy(zero_copy_only=False)
+                mask = ~mask
+            else:
+                mask = np.zeros(len(arr), dtype=bool)
+
+            bool_arr = BooleanArray(data, mask)
             results.append(bool_arr)
 
         return BooleanArray._concat_same_type(results)
diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py
index 8bb32dec2cc0e..ae5bec078734c 100644
--- a/pandas/tests/arrays/masked/test_arrow_compat.py
+++ b/pandas/tests/arrays/masked/test_arrow_compat.py
@@ -55,12 +55,19 @@ def test_arrow_from_arrow_uint():
 
 
 @td.skip_if_no("pyarrow", min_version="0.16.0")
-def test_arrow_sliced():
+def test_arrow_sliced(data):
     # https://github.com/pandas-dev/pandas/issues/38525
     import pyarrow as pa
 
-    df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")})
+    df = pd.DataFrame({"a": data})
     table = pa.table(df)
     result = table.slice(2, None).to_pandas()
     expected = df.iloc[2:].reset_index(drop=True)
     tm.assert_frame_equal(result, expected)
+
+    # no missing values
+    df2 = df.fillna(data[0])
+    table = pa.table(df2)
+    result = table.slice(2, None).to_pandas()
+    expected = df2.iloc[2:].reset_index(drop=True)
+    tm.assert_frame_equal(result, expected)

From 18428269616c521772b49c2378fc4627de40a9a4 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 22 Apr 2021 14:56:11 +0100
Subject: [PATCH 3/4] Support ArrowStringDtype in _result_dtype; extend tests

---
 pandas/core/strings/accessor.py           |  3 +-
 pandas/tests/strings/test_string_array.py | 17 +----
 pandas/tests/strings/test_strings.py      | 83 ++++++++++++++++-------
 3 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 0b5613e302175..85a58d3d99795 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -3002,8 +3002,9 @@ def _result_dtype(arr):
     # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
     # when the list of values is empty.
     from pandas.core.arrays.string_ import StringDtype
+    from pandas.core.arrays.string_arrow import ArrowStringDtype
 
-    if isinstance(arr.dtype, StringDtype):
+    if isinstance(arr.dtype, (StringDtype, ArrowStringDtype)):
         return arr.dtype.name
     else:
         return object
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index 02ccb3a930557..f90d219159c7e 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -13,19 +13,11 @@
 )
 
 
-def test_string_array(nullable_string_dtype, any_string_method, request):
+def test_string_array(nullable_string_dtype, any_string_method):
     method_name, args, kwargs = any_string_method
     if method_name == "decode":
         pytest.skip("decode requires bytes.")
 
-    if nullable_string_dtype == "arrow_string" and method_name in {
-        "extract",
-        "extractall",
-    }:
-        reason = "extract/extractall does not yet dispatch to array"
-        mark = pytest.mark.xfail(reason=reason)
-        request.node.add_marker(mark)
-
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
     b = Series(data, dtype=nullable_string_dtype)
@@ -93,15 +85,10 @@ def test_string_array_boolean_array(nullable_string_dtype, method, expected):
     tm.assert_series_equal(result, expected)
 
 
-def test_string_array_extract(nullable_string_dtype, request):
+def test_string_array_extract(nullable_string_dtype):
     # https://github.com/pandas-dev/pandas/issues/30969
     # Only expand=False & multiple groups was failing
 
-    if nullable_string_dtype == "arrow_string":
-        reason = "extract does not yet dispatch to array"
-        mark = pytest.mark.xfail(reason=reason)
-        request.node.add_marker(mark)
-
     a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype)
     b = Series(["a1", "b2", "cc"], dtype="object")
     pat = r"(\w)(\d)"
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index 06b22f00a38cf..2a52b3ba3f9e1 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import (
     DataFrame,
     Index,
@@ -17,6 +19,27 @@
 import pandas._testing as tm
 
 
+@pytest.fixture(
+    params=[
+        "object",
+        "string",
+        pytest.param(
+            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+        ),
+    ]
+)
+def any_string_dtype(request):
+    """
+    Parametrized fixture returning a string dtype name: 'object', 'string',
+    or 'arrow_string' (skipped unless pyarrow >= 1.0.0 is installed).
+    """
+    # Side-effect import (name unused, hence the noqa); presumably it
+    # registers the "arrow_string" dtype name -- TODO confirm.
+    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+    return request.param
+
+
 def assert_series_or_index_equal(left, right):
     if isinstance(left, Series):
         tm.assert_series_equal(left, right)
@@ -149,10 +172,15 @@ def test_repeat_with_null(nullable_string_dtype):
     tm.assert_series_equal(result, expected)
 
 
-def test_empty_str_methods():
-    empty_str = empty = Series(dtype=object)
-    empty_int = Series(dtype="int64")
-    empty_bool = Series(dtype=bool)
+def test_empty_str_methods(any_string_dtype):
+    empty_str = empty = Series(dtype=any_string_dtype)
+    if any_string_dtype == "object":
+        empty_int = Series(dtype="int64")
+        empty_bool = Series(dtype=bool)
+    else:
+        empty_int = Series(dtype="Int64")
+        empty_bool = Series(dtype="boolean")
+    empty_object = Series(dtype=object)
     empty_bytes = Series(dtype=object)
 
     # GH7241
@@ -184,15 +212,15 @@ def test_empty_str_methods():
     tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies())
     tm.assert_series_equal(empty_str, empty_str.str.join(""))
     tm.assert_series_equal(empty_int, empty.str.len())
-    tm.assert_series_equal(empty_str, empty_str.str.findall("a"))
+    tm.assert_series_equal(empty_object, empty_str.str.findall("a"))
     tm.assert_series_equal(empty_int, empty.str.find("a"))
     tm.assert_series_equal(empty_int, empty.str.rfind("a"))
     tm.assert_series_equal(empty_str, empty.str.pad(42))
     tm.assert_series_equal(empty_str, empty.str.center(42))
-    tm.assert_series_equal(empty_str, empty.str.split("a"))
-    tm.assert_series_equal(empty_str, empty.str.rsplit("a"))
-    tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False))
-    tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False))
+    tm.assert_series_equal(empty_object, empty.str.split("a"))
+    tm.assert_series_equal(empty_object, empty.str.rsplit("a"))
+    tm.assert_series_equal(empty_object, empty.str.partition("a", expand=False))
+    tm.assert_series_equal(empty_object, empty.str.rpartition("a", expand=False))
     tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
     tm.assert_series_equal(empty_str, empty.str.slice(step=1))
     tm.assert_series_equal(empty_str, empty.str.strip())
@@ -200,7 +228,7 @@ def test_empty_str_methods():
     tm.assert_series_equal(empty_str, empty.str.rstrip())
     tm.assert_series_equal(empty_str, empty.str.wrap(42))
     tm.assert_series_equal(empty_str, empty.str.get(0))
-    tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii"))
+    tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii"))
     tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))
     # ismethods should always return boolean (GH 29624)
     tm.assert_series_equal(empty_bool, empty.str.isalnum())
@@ -227,9 +255,9 @@ def test_empty_str_methods_to_frame():
     tm.assert_frame_equal(empty_df, empty.str.rpartition("a"))
 
 
-def test_ismethods():
+def test_ismethods(any_string_dtype):
     values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", "  "]
-    str_s = Series(values)
+    str_s = Series(values, dtype=any_string_dtype)
     alnum_e = [True, True, True, True, True, False, True, True, False, False]
     alpha_e = [True, True, True, False, False, False, True, False, False, False]
     digit_e = [False, False, False, True, False, False, False, True, False, False]
@@ -253,13 +281,14 @@ def test_ismethods():
     upper_e = [True, False, False, False, True, False, True, False, False, False]
     title_e = [True, False, True, False, True, False, False, False, False, False]
 
-    tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e))
-    tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e))
-    tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e))
-    tm.assert_series_equal(str_s.str.isspace(), Series(space_e))
-    tm.assert_series_equal(str_s.str.islower(), Series(lower_e))
-    tm.assert_series_equal(str_s.str.isupper(), Series(upper_e))
-    tm.assert_series_equal(str_s.str.istitle(), Series(title_e))
+    dtype = "bool" if any_string_dtype == "object" else "boolean"
+    tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e, dtype=dtype))
+    tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e, dtype=dtype))
+    tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e, dtype=dtype))
+    tm.assert_series_equal(str_s.str.isspace(), Series(space_e, dtype=dtype))
+    tm.assert_series_equal(str_s.str.islower(), Series(lower_e, dtype=dtype))
+    tm.assert_series_equal(str_s.str.isupper(), Series(upper_e, dtype=dtype))
+    tm.assert_series_equal(str_s.str.istitle(), Series(title_e, dtype=dtype))
 
     assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values]
     assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values]
@@ -270,28 +299,30 @@ def test_ismethods():
     assert str_s.str.istitle().tolist() == [v.istitle() for v in values]
 
 
-def test_isnumeric():
+def test_isnumeric(any_string_dtype):
     # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
     # 0x2605: ★ not number
     # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
     # 0xFF13: ３ Em 3
     values = ["A", "3", "¼", "★", "፸", "３", "four"]
-    s = Series(values)
+    s = Series(values, dtype=any_string_dtype)
     numeric_e = [False, True, True, False, True, True, False]
     decimal_e = [False, True, False, False, False, True, False]
-    tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
-    tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
+    dtype = "bool" if any_string_dtype == "object" else "boolean"
+    tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e, dtype=dtype))
+    tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e, dtype=dtype))
 
     unicodes = ["A", "3", "¼", "★", "፸", "３", "four"]
     assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes]
     assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes]
 
     values = ["A", np.nan, "¼", "★", np.nan, "３", "four"]
-    s = Series(values)
+    s = Series(values, dtype=any_string_dtype)
     numeric_e = [False, np.nan, True, False, np.nan, True, False]
     decimal_e = [False, np.nan, False, False, np.nan, True, False]
-    tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
-    tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
+    dtype = "object" if any_string_dtype == "object" else "boolean"
+    tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e, dtype=dtype))
+    tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e, dtype=dtype))
 
 
 def test_get_dummies():

From 9e2c11bea3ec05604012f1b22668b6782437a747 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 22 Apr 2021 22:08:01 +0100
Subject: [PATCH 4/4] add benchmarks

---
 asv_bench/benchmarks/strings.py | 93 +++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 29 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 76257e1b40f1a..5d9b1c135d7ae 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -50,91 +50,126 @@ def peakmem_cat_frame_construction(self, dtype):
 
 
 class Methods:
-    def setup(self):
-        self.s = Series(tm.makeStringIndex(10 ** 5))
+    params = ["str", "string", "arrow_string"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        # Side-effect import; presumably registers "arrow_string" -- TODO confirm.
+        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+        try:
+            self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
+        except ImportError as err:
+            raise NotImplementedError from err  # asv: skip this dtype
 
-    def time_center(self):
+    def time_center(self, dtype):
         self.s.str.center(100)
 
-    def time_count(self):
+    def time_count(self, dtype):
         self.s.str.count("A")
 
-    def time_endswith(self):
+    def time_endswith(self, dtype):
         self.s.str.endswith("A")
 
-    def time_extract(self):
+    def time_extract(self, dtype):
         with warnings.catch_warnings(record=True):
             self.s.str.extract("(\\w*)A(\\w*)")
 
-    def time_findall(self):
+    def time_findall(self, dtype):
         self.s.str.findall("[A-Z]+")
 
-    def time_find(self):
+    def time_find(self, dtype):
         self.s.str.find("[A-Z]+")
 
-    def time_rfind(self):
+    def time_rfind(self, dtype):
         self.s.str.rfind("[A-Z]+")
 
-    def time_get(self):
+    def time_get(self, dtype):
         self.s.str.get(0)
 
-    def time_len(self):
+    def time_len(self, dtype):
         self.s.str.len()
 
-    def time_join(self):
+    def time_join(self, dtype):
         self.s.str.join(" ")
 
-    def time_match(self):
+    def time_match(self, dtype):
         self.s.str.match("A")
 
-    def time_normalize(self):
+    def time_normalize(self, dtype):
         self.s.str.normalize("NFC")
 
-    def time_pad(self):
+    def time_pad(self, dtype):
         self.s.str.pad(100, side="both")
 
-    def time_partition(self):
+    def time_partition(self, dtype):
         self.s.str.partition("A")
 
-    def time_rpartition(self):
+    def time_rpartition(self, dtype):
         self.s.str.rpartition("A")
 
-    def time_replace(self):
+    def time_replace(self, dtype):
         self.s.str.replace("A", "\x01\x01")
 
-    def time_translate(self):
+    def time_translate(self, dtype):
         self.s.str.translate({"A": "\x01\x01"})
 
-    def time_slice(self):
+    def time_slice(self, dtype):
         self.s.str.slice(5, 15, 2)
 
-    def time_startswith(self):
+    def time_startswith(self, dtype):
         self.s.str.startswith("A")
 
-    def time_strip(self):
+    def time_strip(self, dtype):
         self.s.str.strip("A")
 
-    def time_rstrip(self):
+    def time_rstrip(self, dtype):
         self.s.str.rstrip("A")
 
-    def time_lstrip(self):
+    def time_lstrip(self, dtype):
         self.s.str.lstrip("A")
 
-    def time_title(self):
+    def time_title(self, dtype):
         self.s.str.title()
 
-    def time_upper(self):
+    def time_upper(self, dtype):
         self.s.str.upper()
 
-    def time_lower(self):
+    def time_lower(self, dtype):
         self.s.str.lower()
 
-    def time_wrap(self):
+    def time_wrap(self, dtype):
         self.s.str.wrap(10)
 
-    def time_zfill(self):
+    def time_zfill(self, dtype):
         self.s.str.zfill(10)
 
+    def time_isalnum(self, dtype):
+        self.s.str.isalnum()
+
+    def time_isalpha(self, dtype):
+        self.s.str.isalpha()
+
+    def time_isdecimal(self, dtype):
+        self.s.str.isdecimal()
+
+    def time_isdigit(self, dtype):
+        self.s.str.isdigit()
+
+    def time_islower(self, dtype):
+        self.s.str.islower()
+
+    def time_isnumeric(self, dtype):
+        self.s.str.isnumeric()
+
+    def time_isspace(self, dtype):
+        self.s.str.isspace()
+
+    def time_istitle(self, dtype):
+        self.s.str.istitle()
+
+    def time_isupper(self, dtype):
+        self.s.str.isupper()
+
 
 class Repeat: