From 9b6c0548da182faede95845691c0b46080c541e7 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 10:44:47 +0100
Subject: [PATCH 01/11] ArrowStringArrayMixin

---
 pandas/_libs/lib.pyx                      |  1 +
 pandas/conftest.py                        | 20 ++++++
 pandas/core/arrays/string_arrow.py        |  4 +-
 pandas/core/strings/__init__.py           |  2 +
 pandas/core/strings/arrow_array.py        | 83 +++++++++++++++++++++++
 pandas/tests/strings/test_string_array.py | 20 +++---
 6 files changed, 119 insertions(+), 11 deletions(-)
 create mode 100644 pandas/core/strings/arrow_array.py

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 94a4d586b4f13..8f496713aced4 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1110,6 +1110,7 @@ _TYPE_MAP = {
     "complex128": "complex",
     "c": "complex",
     "string": "string",
+    "arrow_string": "string",
     "S": "bytes",
     "U": "string",
     "bool": "boolean",
diff --git a/pandas/conftest.py b/pandas/conftest.py
index f3356d2998ff8..0a64273321d29 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1131,6 +1131,26 @@ def string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        "string",
+        pytest.param(
+            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+        ),
+    ]
+)
+def nullable_string_dtype(request):
+    """
+    Parametrized fixture for nullable string dtypes.
+
+    * 'string'
+    * 'arrow_string'
+    """
+    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa
+
+    return request.param
+
+
 @pytest.fixture(params=tm.BYTES_DTYPES)
 def bytes_dtype(request):
     """
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 7251faee333bb..28a23c47b7877 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -43,6 +43,7 @@
     check_array_indexer,
     validate_indices,
 )
+from pandas.core.strings.arrow_array import ArrowStringArrayMixin
 
 try:
     import pyarrow as pa
@@ -153,7 +154,7 @@ def __eq__(self, other) -> bool:
             return False
 
 
-class ArrowStringArray(OpsMixin, ExtensionArray):
+class ArrowStringArray(OpsMixin, ExtensionArray, ArrowStringArrayMixin):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
 
@@ -198,6 +199,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray):
     """
 
     _dtype = ArrowStringDtype()
+    _str_na_value = ArrowStringDtype.na_value
 
     def __init__(self, values):
         self._chk_pyarrow_available()
diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py
index 943686fc85a05..bd27c8c504a47 100644
--- a/pandas/core/strings/__init__.py
+++ b/pandas/core/strings/__init__.py
@@ -25,6 +25,8 @@
 #     - StringArray
 #     - PandasArray
 #     - Categorical
+#     - ArrowStringArrayMixin
+#       - ArrowStringArray
 
 from pandas.core.strings.accessor import StringMethods
 from pandas.core.strings.base import BaseStringArrayMethods
diff --git a/pandas/core/strings/arrow_array.py b/pandas/core/strings/arrow_array.py
new file mode 100644
index 0000000000000..c50dcc198031f
--- /dev/null
+++ b/pandas/core/strings/arrow_array.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._typing import Dtype
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer_dtype,
+    is_object_dtype,
+    is_string_dtype,
+)
+
+from pandas.core.missing import isna
+from pandas.core.strings.object_array import ObjectStringArrayMixin
+
+
+class ArrowStringArrayMixin(ObjectStringArrayMixin):
+    """
+    String Methods operating on string type PyArrow arrays.
+    """
+
+    # ------------------------------------------------------------------------
+    # String methods interface
+
+    def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
+        from pandas.arrays import (
+            BooleanArray,
+            IntegerArray,
+            StringArray,
+        )
+        from pandas.core.arrays.string_ import StringDtype
+
+        if dtype is None:
+            dtype = StringDtype()
+        if na_value is None:
+            na_value = self.dtype.na_value
+
+        mask = isna(self)
+        arr = np.asarray(self)
+
+        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+            constructor: type[IntegerArray] | type[BooleanArray]
+            if is_integer_dtype(dtype):
+                constructor = IntegerArray
+            else:
+                constructor = BooleanArray
+
+            na_value_is_na = isna(na_value)
+            if na_value_is_na:
+                na_value = 1
+            result = lib.map_infer_mask(
+                arr,
+                f,
+                mask.view("uint8"),
+                convert=False,
+                na_value=na_value,
+                # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
+                # "object"
+                # error: Argument 1 to "dtype" has incompatible type
+                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
+                # "Type[object]"
+                dtype=np.dtype(dtype),  # type: ignore[type-var,arg-type]
+            )
+
+            if not na_value_is_na:
+                mask[:] = False
+
+            return constructor(result, mask)
+
+        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # i.e. StringDtype
+            result = lib.map_infer_mask(
+                arr, f, mask.view("uint8"), convert=False, na_value=na_value
+            )
+            return StringArray(result)
+        else:
+            # This is when the result type is object. We reach this when
+            # -> We know the result type is truly object (e.g. .encode returns bytes
+            #    or .findall returns a list).
+            # -> We don't know the result type. E.g. `.get` can return anything.
+            return lib.map_infer_mask(arr, f, mask.view("uint8"))
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index b51132caf7573..a74692f3ae13b 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -11,14 +11,14 @@
 )
 
 
-def test_string_array(any_string_method):
+def test_string_array(nullable_string_dtype, any_string_method):
     method_name, args, kwargs = any_string_method
     if method_name == "decode":
         pytest.skip("decode requires bytes.")
 
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
-    b = Series(data, dtype="string")
+    b = Series(data, dtype=nullable_string_dtype)
 
     expected = getattr(a.str, method_name)(*args, **kwargs)
     result = getattr(b.str, method_name)(*args, **kwargs)
@@ -60,8 +60,8 @@ def test_string_array(any_string_method):
         ("rindex", [2, None]),
     ],
 )
-def test_string_array_numeric_integer_array(method, expected):
-    s = Series(["aba", None], dtype="string")
+def test_string_array_numeric_integer_array(nullable_string_dtype, method, expected):
+    s = Series(["aba", None], dtype=nullable_string_dtype)
     result = getattr(s.str, method)("a")
     expected = Series(expected, dtype="Int64")
     tm.assert_series_equal(result, expected)
@@ -76,17 +76,17 @@ def test_string_array_numeric_integer_array(method, expected):
         ("isdigit", [False, None, True]),
     ],
 )
-def test_string_array_boolean_array(method, expected):
-    s = Series(["a", None, "1"], dtype="string")
+def test_string_array_boolean_array(nullable_string_dtype, method, expected):
+    s = Series(["a", None, "1"], dtype=nullable_string_dtype)
     result = getattr(s.str, method)()
     expected = Series(expected, dtype="boolean")
     tm.assert_series_equal(result, expected)
 
 
-def test_string_array_extract():
+def test_string_array_extract(nullable_string_dtype):
     # https://github.com/pandas-dev/pandas/issues/30969
     # Only expand=False & multiple groups was failing
-    a = Series(["a1", "b2", "cc"], dtype="string")
+    a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype)
     b = Series(["a1", "b2", "cc"], dtype="object")
     pat = r"(\w)(\d)"
 
@@ -98,8 +98,8 @@ def test_string_array_extract():
     tm.assert_equal(result, expected)
 
 
-def test_str_get_stringarray_multiple_nans():
-    s = Series(pd.array(["a", "ab", pd.NA, "abc"]))
+def test_str_get_stringarray_multiple_nans(nullable_string_dtype):
+    s = Series(pd.array(["a", "ab", pd.NA, "abc"], dtype=nullable_string_dtype))
     result = s.str.get(2)
     expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"]))
     tm.assert_series_equal(result, expected)

From c0fedcd83296e00092daf240761febbd119676cb Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 11:05:51 +0100
Subject: [PATCH 02/11] move _str_map to ArrowStringArray

---
 pandas/core/arrays/string_arrow.py | 72 ++++++++++++++++++++++++--
 pandas/core/strings/__init__.py    |  3 +-
 pandas/core/strings/arrow_array.py | 83 ------------------------------
 3 files changed, 70 insertions(+), 88 deletions(-)
 delete mode 100644 pandas/core/strings/arrow_array.py

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 28a23c47b7877..043fbba2fd6f9 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -26,6 +26,10 @@
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.common import (
+    is_object_dtype,
+    is_string_dtype,
+)
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna
 
@@ -43,7 +47,7 @@
     check_array_indexer,
     validate_indices,
 )
-from pandas.core.strings.arrow_array import ArrowStringArrayMixin
+from pandas.core.strings.object_array import ObjectStringArrayMixin
 
 try:
     import pyarrow as pa
@@ -154,7 +158,7 @@ def __eq__(self, other) -> bool:
             return False
 
 
-class ArrowStringArray(OpsMixin, ExtensionArray, ArrowStringArrayMixin):
+class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
 
@@ -199,7 +203,6 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ArrowStringArrayMixin):
     """
 
     _dtype = ArrowStringDtype()
-    _str_na_value = ArrowStringDtype.na_value
 
     def __init__(self, values):
         self._chk_pyarrow_available()
@@ -682,3 +685,66 @@ def value_counts(self, dropna: bool = True) -> Series:
             raise NotImplementedError("yo")
 
         return Series(counts, index=index).astype("Int64")
+
+    # ------------------------------------------------------------------------
+    # String methods interface
+
+    _str_na_value = ArrowStringDtype.na_value
+
+    def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
+        from pandas.arrays import (
+            BooleanArray,
+            IntegerArray,
+            StringArray,
+        )
+        from pandas.core.arrays.string_ import StringDtype
+
+        if dtype is None:
+            dtype = StringDtype()
+        if na_value is None:
+            na_value = self.dtype.na_value
+
+        mask = isna(self)
+        arr = np.asarray(self)
+
+        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+            constructor: type[IntegerArray] | type[BooleanArray]
+            if is_integer_dtype(dtype):
+                constructor = IntegerArray
+            else:
+                constructor = BooleanArray
+
+            na_value_is_na = isna(na_value)
+            if na_value_is_na:
+                na_value = 1
+            result = lib.map_infer_mask(
+                arr,
+                f,
+                mask.view("uint8"),
+                convert=False,
+                na_value=na_value,
+                # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
+                # "object"
+                # error: Argument 1 to "dtype" has incompatible type
+                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
+                # "Type[object]"
+                dtype=np.dtype(dtype),  # type: ignore[type-var,arg-type]
+            )
+
+            if not na_value_is_na:
+                mask[:] = False
+
+            return constructor(result, mask)
+
+        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # i.e. StringDtype
+            result = lib.map_infer_mask(
+                arr, f, mask.view("uint8"), convert=False, na_value=na_value
+            )
+            return StringArray(result)
+        else:
+            # This is when the result type is object. We reach this when
+            # -> We know the result type is truly object (e.g. .encode returns bytes
+            #    or .findall returns a list).
+            # -> We don't know the result type. E.g. `.get` can return anything.
+            return lib.map_infer_mask(arr, f, mask.view("uint8"))
diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py
index bd27c8c504a47..28aba7c9ce0b3 100644
--- a/pandas/core/strings/__init__.py
+++ b/pandas/core/strings/__init__.py
@@ -25,8 +25,7 @@
 #     - StringArray
 #     - PandasArray
 #     - Categorical
-#     - ArrowStringArrayMixin
-#       - ArrowStringArray
+#     - ArrowStringArray
 
 from pandas.core.strings.accessor import StringMethods
 from pandas.core.strings.base import BaseStringArrayMethods
diff --git a/pandas/core/strings/arrow_array.py b/pandas/core/strings/arrow_array.py
deleted file mode 100644
index c50dcc198031f..0000000000000
--- a/pandas/core/strings/arrow_array.py
+++ /dev/null
@@ -1,83 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import Dtype
-
-from pandas.core.dtypes.common import (
-    is_bool_dtype,
-    is_integer_dtype,
-    is_object_dtype,
-    is_string_dtype,
-)
-
-from pandas.core.missing import isna
-from pandas.core.strings.object_array import ObjectStringArrayMixin
-
-
-class ArrowStringArrayMixin(ObjectStringArrayMixin):
-    """
-    String Methods operating on string type PyArrow arrays.
-    """
-
-    # ------------------------------------------------------------------------
-    # String methods interface
-
-    def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
-        from pandas.arrays import (
-            BooleanArray,
-            IntegerArray,
-            StringArray,
-        )
-        from pandas.core.arrays.string_ import StringDtype
-
-        if dtype is None:
-            dtype = StringDtype()
-        if na_value is None:
-            na_value = self.dtype.na_value
-
-        mask = isna(self)
-        arr = np.asarray(self)
-
-        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
-            constructor: type[IntegerArray] | type[BooleanArray]
-            if is_integer_dtype(dtype):
-                constructor = IntegerArray
-            else:
-                constructor = BooleanArray
-
-            na_value_is_na = isna(na_value)
-            if na_value_is_na:
-                na_value = 1
-            result = lib.map_infer_mask(
-                arr,
-                f,
-                mask.view("uint8"),
-                convert=False,
-                na_value=na_value,
-                # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
-                # "object"
-                # error: Argument 1 to "dtype" has incompatible type
-                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
-                # "Type[object]"
-                dtype=np.dtype(dtype),  # type: ignore[type-var,arg-type]
-            )
-
-            if not na_value_is_na:
-                mask[:] = False
-
-            return constructor(result, mask)
-
-        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
-            # i.e. StringDtype
-            result = lib.map_infer_mask(
-                arr, f, mask.view("uint8"), convert=False, na_value=na_value
-            )
-            return StringArray(result)
-        else:
-            # This is when the result type is object. We reach this when
-            # -> We know the result type is truly object (e.g. .encode returns bytes
-            #    or .findall returns a list).
-            # -> We don't know the result type. E.g. `.get` can return anything.
-            return lib.map_infer_mask(arr, f, mask.view("uint8"))

From e6033216fcf76f2e6d73d7f096e38620576b69aa Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 11:35:12 +0100
Subject: [PATCH 03/11] test_str_get_stringarray_multiple_nans

---
 pandas/core/arrays/string_arrow.py        | 6 ++----
 pandas/tests/strings/test_string_array.py | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 043fbba2fd6f9..4dbbb58581179 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -695,12 +695,10 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
         from pandas.arrays import (
             BooleanArray,
             IntegerArray,
-            StringArray,
         )
-        from pandas.core.arrays.string_ import StringDtype
 
         if dtype is None:
-            dtype = StringDtype()
+            dtype = self.dtype
         if na_value is None:
             na_value = self.dtype.na_value
 
@@ -741,7 +739,7 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
             result = lib.map_infer_mask(
                 arr, f, mask.view("uint8"), convert=False, na_value=na_value
             )
-            return StringArray(result)
+            return self._from_sequence(result)
         else:
             # This is when the result type is object. We reach this when
             # -> We know the result type is truly object (e.g. .encode returns bytes
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index a74692f3ae13b..5408c89b8ddcb 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -101,5 +101,5 @@ def test_string_array_extract(nullable_string_dtype):
 def test_str_get_stringarray_multiple_nans(nullable_string_dtype):
     s = Series(pd.array(["a", "ab", pd.NA, "abc"], dtype=nullable_string_dtype))
     result = s.str.get(2)
-    expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"]))
+    expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"], dtype=nullable_string_dtype))
     tm.assert_series_equal(result, expected)

From 37035b43621b5f311e540d36ed016942427ad892 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 11:38:46 +0100
Subject: [PATCH 04/11] flake code

---
 pandas/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 0a64273321d29..03a6b610acdb8 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1146,7 +1146,7 @@ def nullable_string_dtype(request):
     * 'string'
     * 'arrow_string'
     """
-    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa
+    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
 
     return request.param
 

From c823953694aac48fc3839ce61bd6db985bf26e61 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 14:07:47 +0100
Subject: [PATCH 05/11] test assertions

---
 pandas/tests/strings/test_string_array.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index 5408c89b8ddcb..dabd3798317c8 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -27,7 +27,7 @@ def test_string_array(nullable_string_dtype, any_string_method):
         if expected.dtype == "object" and lib.is_string_array(
             expected.dropna().values,
         ):
-            assert result.dtype == "string"
+            assert result.dtype == nullable_string_dtype
             result = result.astype(object)
 
         elif expected.dtype == "object" and lib.is_bool_array(
@@ -46,7 +46,7 @@ def test_string_array(nullable_string_dtype, any_string_method):
 
     elif isinstance(expected, DataFrame):
         columns = expected.select_dtypes(include="object").columns
-        assert all(result[columns].dtypes == "string")
+        assert all(result[columns].dtypes == nullable_string_dtype)
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)
 
@@ -73,7 +73,6 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec
         ("isdigit", [False, None, True]),
         ("isalpha", [True, None, False]),
         ("isalnum", [True, None, True]),
-        ("isdigit", [False, None, True]),
     ],
 )
 def test_string_array_boolean_array(nullable_string_dtype, method, expected):
@@ -92,7 +91,7 @@ def test_string_array_extract(nullable_string_dtype):
 
     result = a.str.extract(pat, expand=False)
     expected = b.str.extract(pat, expand=False)
-    assert all(result.dtypes == "string")
+    assert all(result.dtypes == nullable_string_dtype)
 
     result = result.astype(object)
     tm.assert_equal(result, expected)

From 76292ecf6047dc19ce23800e11975af2c9035d36 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 16:33:26 +0100
Subject: [PATCH 06/11] remove xfail from test_string_methods

---
 pandas/tests/arrays/string_/test_string.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 8b84a510c01e6..749f3d0aee8a5 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -91,17 +91,11 @@ def test_setitem_with_scalar_string(dtype):
 @pytest.mark.parametrize(
     "input, method",
     [
-        (["a", "b", "c"], operator.methodcaller("capitalize")),
         (["a", "b", "c"], operator.methodcaller("capitalize")),
         (["a b", "a bc. de"], operator.methodcaller("capitalize")),
     ],
 )
-def test_string_methods(input, method, dtype, request):
-    if dtype == "arrow_string":
-        reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'"
-        mark = pytest.mark.xfail(reason=reason)
-        request.node.add_marker(mark)
-
+def test_string_methods(input, method, dtype):
     a = pd.Series(input, dtype=dtype)
     b = pd.Series(input, dtype="object")
     result = method(a.str)

From 50db8765df093a59e644ac3d283ecc00a7f6f52d Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 16:54:33 +0100
Subject: [PATCH 07/11] xfail extract/extractall tests - out of scope for this
 PR

---
 pandas/tests/strings/test_string_array.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index dabd3798317c8..e7cf5085d2532 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -11,11 +11,19 @@
 )
 
 
-def test_string_array(nullable_string_dtype, any_string_method):
+def test_string_array(nullable_string_dtype, any_string_method, request):
     method_name, args, kwargs = any_string_method
     if method_name == "decode":
         pytest.skip("decode requires bytes.")
 
+    if nullable_string_dtype == "arrow_string" and method_name in {
+        "extract",
+        "extractall",
+    }:
+        reason = "extract/extractall does not yet dispatch to array"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
     b = Series(data, dtype=nullable_string_dtype)
@@ -82,9 +90,15 @@ def test_string_array_boolean_array(nullable_string_dtype, method, expected):
     tm.assert_series_equal(result, expected)
 
 
-def test_string_array_extract(nullable_string_dtype):
+def test_string_array_extract(nullable_string_dtype, request):
     # https://github.com/pandas-dev/pandas/issues/30969
     # Only expand=False & multiple groups was failing
+
+    if nullable_string_dtype == "arrow_string":
+        reason = "extract does not yet dispatch to array"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
     a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype)
     b = Series(["a1", "b2", "cc"], dtype="object")
     pat = r"(\w)(\d)"

From 374924b1a3c354f28c908c2da4fa688e58446cf2 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 31 Mar 2021 18:44:19 +0100
Subject: [PATCH 08/11] special case in _wrap_result

---
 pandas/core/strings/accessor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 1eda06dbbb1c4..6cace87924d40 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -154,10 +154,11 @@ class StringMethods(NoNewAttributesMixin):
 
     def __init__(self, data):
         from pandas.core.arrays.string_ import StringDtype
+        from pandas.core.arrays.string_arrow import ArrowStringDtype
 
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data.dtype)
-        self._is_string = isinstance(data.dtype, StringDtype)
+        self._is_string = isinstance(data.dtype, (StringDtype, ArrowStringDtype))
         self._data = data
 
         self._index = self._name = None
@@ -316,7 +317,7 @@ def cons_row(x):
             # This is a mess.
             dtype: Optional[str]
             if self._is_string and returns_string:
-                dtype = "string"
+                dtype = self._orig.dtype
             else:
                 dtype = None
 

From 4b86f67cff53f97ded913c29df089312a979ea49 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 1 Apr 2021 16:39:28 +0100
Subject: [PATCH 09/11] add isnumeric to test_string_array_boolean_array

---
 pandas/tests/strings/test_string_array.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index e7cf5085d2532..23c9b14c5a36a 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -81,6 +81,7 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec
         ("isdigit", [False, None, True]),
         ("isalpha", [True, None, False]),
         ("isalnum", [True, None, True]),
+        ("isnumeric", [False, None, True]),
     ],
 )
 def test_string_array_boolean_array(nullable_string_dtype, method, expected):

From aaf54ca8d69187c0230060e7602b32c2f84cdee5 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 9 Apr 2021 12:12:12 +0100
Subject: [PATCH 10/11] mypy fixup

---
 pandas/core/arrays/string_arrow.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index bfc980efaeb06..3f24aabb81dd2 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -728,7 +728,11 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
             if not na_value_is_na:
                 mask[:] = False
 
-            return constructor(result, mask)
+            # error: Argument 1 to "IntegerArray" has incompatible type
+            # "Union[ExtensionArray, ndarray]"; expected "ndarray"
+            # error: Argument 1 to "BooleanArray" has incompatible type
+            # "Union[ExtensionArray, ndarray]"; expected "ndarray"
+            return constructor(result, mask)  # type: ignore[arg-type]
 
         elif is_string_dtype(dtype) and not is_object_dtype(dtype):
             # i.e. StringDtype

From d1c8a3e0fbe1fa71bee93eab1a001ff6a047f427 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 15 Apr 2021 08:55:30 +0100
Subject: [PATCH 11/11] add comments

---
 pandas/core/arrays/string_arrow.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 3f24aabb81dd2..fd47597b2191f 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -154,6 +154,11 @@ def __eq__(self, other) -> bool:
             return False
 
 
+# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
+# ObjectStringArrayMixin because we want to have the object-dtype based methods as
+# fallback for the ones that pyarrow doesn't yet support
+
+
 class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
@@ -688,6 +693,9 @@ def value_counts(self, dropna: bool = True) -> Series:
     _str_na_value = ArrowStringDtype.na_value
 
     def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
+        # TODO: de-duplicate with StringArray method. This method is more or less
+        # copy and paste.
+
         from pandas.arrays import (
             BooleanArray,
             IntegerArray,