From b24afc9ba03939403e48f9d97e0b6d2a04662799 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 13 Aug 2023 14:25:27 +0200
Subject: [PATCH 01/23] Start new string array

---
 pandas/core/arrays/string_arrow.py | 95 ++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 12 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4a70fcf6b5a93..474351fc6b1d7 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -113,6 +113,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     # error: Incompatible types in assignment (expression has type "StringDtype",
     # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
     _dtype: StringDtype  # type: ignore[assignment]
+    _result_converter = lambda result: BooleanDtype().__from_arrow__(result)
 
     def __init__(self, values) -> None:
         super().__init__(values)
@@ -313,7 +314,7 @@ def _str_contains(
             result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
         else:
             result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
-        result = BooleanDtype().__from_arrow__(result)
+        result = self._result_converter(result)
         if not isna(na):
             result[isna(result)] = bool(na)
         return result
@@ -322,7 +323,7 @@ def _str_startswith(self, pat: str, na=None):
         result = pc.starts_with(self._pa_array, pattern=pat)
         if not isna(na):
             result = result.fill_null(na)
-        result = BooleanDtype().__from_arrow__(result)
+        result = self._result_converter_(result)
         if not isna(na):
             result[isna(result)] = bool(na)
         return result
@@ -331,7 +332,7 @@ def _str_endswith(self, pat: str, na=None):
         result = pc.ends_with(self._pa_array, pattern=pat)
         if not isna(na):
             result = result.fill_null(na)
-        result = BooleanDtype().__from_arrow__(result)
+        result = self._result_converter(result)
         if not isna(na):
             result[isna(result)] = bool(na)
         return result
@@ -369,39 +370,39 @@ def _str_fullmatch(
 
     def _str_isalnum(self):
         result = pc.utf8_is_alnum(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_isalpha(self):
         result = pc.utf8_is_alpha(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_isdecimal(self):
         result = pc.utf8_is_decimal(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_isdigit(self):
         result = pc.utf8_is_digit(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_islower(self):
         result = pc.utf8_is_lower(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_isnumeric(self):
         result = pc.utf8_is_numeric(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_isspace(self):
         result = pc.utf8_is_space(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_istitle(self):
         result = pc.utf8_is_title(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_isupper(self):
         result = pc.utf8_is_upper(self._pa_array)
-        return BooleanDtype().__from_arrow__(result)
+        return self._result_converter(result)
 
     def _str_len(self):
         result = pc.utf8_length(self._pa_array)
@@ -433,3 +434,73 @@ def _str_rstrip(self, to_strip=None):
         else:
             result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
         return type(self)(result)
+
+
+class ArrowStringArrayNumpySemantics(ArrowStringArray):
+    _result_converter = lambda result: result.to_numpy()
+
+    def _str_len(self):
+        result = pc.utf8_length(self._pa_array)
+        return result.to_numpy()
+
+    def _str_map(
+        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
+    ):
+        """
+        Map a callable over valid elements of the array.
+
+        Parameters
+        ----------
+        f : Callable
+            A function to call on each non-NA element.
+        na_value : Scalar, optional
+            The value to set for NA values. Might also be used for the
+            fill value if the callable `f` raises an exception.
+            This defaults to ``self._str_na_value`` which is ``np.nan``
+            for object-dtype and Categorical and ``pd.NA`` for StringArray.
+        dtype : Dtype, optional
+            The dtype of the result array.
+        convert : bool, default True
+            Whether to call `maybe_convert_objects` on the resulting ndarray
+        """
+        if dtype is None:
+            dtype = np.dtype("object")
+        if na_value is None:
+            na_value = self._str_na_value
+
+        if not len(self):
+            return np.array([], dtype=dtype)
+
+        arr = np.asarray(self, dtype=object)
+        mask = isna(arr)
+        map_convert = convert and not np.all(mask)
+        try:
+            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
+        except (TypeError, AttributeError) as err:
+            # Reraise the exception if callable `f` got wrong number of args.
+            # The user may want to be warned by this, instead of getting NaN
+            p_err = (
+                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
+                r"(?(3)required )positional arguments?"
+            )
+
+            if len(err.args) >= 1 and re.search(p_err, err.args[0]):
+                # FIXME: this should be totally avoidable
+                raise err
+
+            def g(x):
+                # This type of fallback behavior can be removed once
+                # we remove object-dtype .str accessor.
+                try:
+                    return f(x)
+                except (TypeError, AttributeError):
+                    return na_value
+
+            return self._str_map(g, na_value=na_value, dtype=dtype)
+        if not isinstance(result, np.ndarray):
+            return result
+        if na_value is not np.nan:
+            np.putmask(result, mask, na_value)
+            if convert and result.dtype == object:
+                result = lib.maybe_convert_objects(result)
+        return result

From b306c6f1550279687a9d1aaa2364e6546e5d905b Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 13 Aug 2023 18:30:11 +0200
Subject: [PATCH 02/23] Add missing methods

---
 pandas/core/arrays/_arrow_string_mixins.py | 94 +++++++++++++++++++++
 pandas/core/arrays/arrow/array.py          | 95 +++-------------------
 pandas/core/arrays/string_.py              | 21 +++--
 pandas/core/arrays/string_arrow.py         | 57 ++++++++++++-
 4 files changed, 176 insertions(+), 91 deletions(-)
 create mode 100644 pandas/core/arrays/_arrow_string_mixins.py

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
new file mode 100644
index 0000000000000..62857285cadef
--- /dev/null
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -0,0 +1,94 @@
+from __future__ import annotations
+
+from typing import Literal
+
+from pandas.compat import pa_version_under7p0
+
+if not pa_version_under7p0:
+    import pyarrow as pa
+    import pyarrow.compute as pc
+
+
+class ArrowStringArrayMixin:
+    def _str_pad(
+        self,
+        width: int,
+        side: Literal["left", "right", "both"] = "left",
+        fillchar: str = " ",
+    ):
+        if side == "left":
+            pa_pad = pc.utf8_lpad
+        elif side == "right":
+            pa_pad = pc.utf8_rpad
+        elif side == "both":
+            pa_pad = pc.utf8_center
+        else:
+            raise ValueError(
+                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
+            )
+        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
+
+    def _str_get(self, i: int):
+        lengths = pc.utf8_length(self._pa_array)
+        if i >= 0:
+            out_of_bounds = pc.greater_equal(i, lengths)
+            start = i
+            stop = i + 1
+            step = 1
+        else:
+            out_of_bounds = pc.greater(-i, lengths)
+            start = i
+            stop = i - 1
+            step = -1
+        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
+        selected = pc.utf8_slice_codeunits(
+            self._pa_array, start=start, stop=stop, step=step
+        )
+        null_value = pa.scalar(None, type=self._pa_array.type)
+        result = pc.if_else(not_out_of_bounds, selected, null_value)
+        return type(self)(result)
+
+    def _str_partition(self, sep: str, expand: bool):
+        predicate = lambda val: val.partition(sep)
+        result = self._apply_elementwise(predicate)
+        return type(self)(pa.chunked_array(result))
+
+    def _str_rpartition(self, sep: str, expand: bool):
+        predicate = lambda val: val.rpartition(sep)
+        result = self._apply_elementwise(predicate)
+        return type(self)(pa.chunked_array(result))
+
+    def _str_slice(
+        self, start: int | None = None, stop: int | None = None, step: int | None = None
+    ):
+        if start is None:
+            start = 0
+        if step is None:
+            step = 1
+        return type(self)(
+            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
+        )
+
+    def _str_slice_replace(
+        self, start: int | None = None, stop: int | None = None, repl: str | None = None
+    ):
+        if repl is None:
+            repl = ""
+        if start is None:
+            start = 0
+        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
+
+    def _str_capitalize(self):
+        return type(self)(pc.utf8_capitalize(self._pa_array))
+
+    def _str_title(self):
+        return type(self)(pc.utf8_title(self._pa_array))
+
+    def _str_swapcase(self):
+        return type(self)(pc.utf8_swapcase(self._pa_array))
+
+    def _str_removesuffix(self, suffix: str):
+        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
+        # Empty suffix: stop=-0 would truncate every value to ""; use None instead.
+        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix) or None)
+        return type(self)(pc.if_else(ends_with, removed, self._pa_array))
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 88695f11fba59..e83ce450218a5 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -42,6 +42,7 @@
 
 from pandas.core import roperator
 from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays.base import (
     ExtensionArray,
     ExtensionArraySupportsAnyAll,
@@ -184,7 +185,10 @@ def to_pyarrow_type(
 
 
 class ArrowExtensionArray(
-    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
+    OpsMixin,
+    ExtensionArraySupportsAnyAll,
+    ArrowStringArrayMixin,
+    BaseStringArrayMethods,
 ):
     """
     Pandas ExtensionArray backed by a PyArrow ChunkedArray.
@@ -246,6 +250,12 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
             )
         self._dtype = ArrowDtype(self._pa_array.type)
 
+    def __dir__(self):
+        o = set(dir(type(self)))
+        o.update(self.__dict__)
+        o.update(set(dir(ArrowStringArrayMixin)))
+        return list(o)
+
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
         """
@@ -1987,24 +1997,6 @@ def _str_count(self, pat: str, flags: int = 0):
             raise NotImplementedError(f"count not implemented with {flags=}")
         return type(self)(pc.count_substring_regex(self._pa_array, pat))
 
-    def _str_pad(
-        self,
-        width: int,
-        side: Literal["left", "right", "both"] = "left",
-        fillchar: str = " ",
-    ):
-        if side == "left":
-            pa_pad = pc.utf8_lpad
-        elif side == "right":
-            pa_pad = pc.utf8_rpad
-        elif side == "both":
-            pa_pad = pc.utf8_center
-        else:
-            raise ValueError(
-                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
-            )
-        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))
-
     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
     ):
@@ -2089,26 +2081,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             )
         return type(self)(result)
 
-    def _str_get(self, i: int):
-        lengths = pc.utf8_length(self._pa_array)
-        if i >= 0:
-            out_of_bounds = pc.greater_equal(i, lengths)
-            start = i
-            stop = i + 1
-            step = 1
-        else:
-            out_of_bounds = pc.greater(-i, lengths)
-            start = i
-            stop = i - 1
-            step = -1
-        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
-        selected = pc.utf8_slice_codeunits(
-            self._pa_array, start=start, stop=stop, step=step
-        )
-        null_value = pa.scalar(None, type=self._pa_array.type)
-        result = pc.if_else(not_out_of_bounds, selected, null_value)
-        return type(self)(result)
-
     def _str_join(self, sep: str):
         if pa.types.is_string(self._pa_array.type):
             result = self._apply_elementwise(list)
@@ -2117,36 +2089,6 @@ def _str_join(self, sep: str):
             result = self._pa_array
         return type(self)(pc.binary_join(result, sep))
 
-    def _str_partition(self, sep: str, expand: bool):
-        predicate = lambda val: val.partition(sep)
-        result = self._apply_elementwise(predicate)
-        return type(self)(pa.chunked_array(result))
-
-    def _str_rpartition(self, sep: str, expand: bool):
-        predicate = lambda val: val.rpartition(sep)
-        result = self._apply_elementwise(predicate)
-        return type(self)(pa.chunked_array(result))
-
-    def _str_slice(
-        self, start: int | None = None, stop: int | None = None, step: int | None = None
-    ):
-        if start is None:
-            start = 0
-        if step is None:
-            step = 1
-        return type(self)(
-            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
-        )
-
-    def _str_slice_replace(
-        self, start: int | None = None, stop: int | None = None, repl: str | None = None
-    ):
-        if repl is None:
-            repl = ""
-        if start is None:
-            start = 0
-        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
-
     def _str_isalnum(self):
         return type(self)(pc.utf8_is_alnum(self._pa_array))
 
@@ -2171,18 +2113,9 @@ def _str_isspace(self):
     def _str_istitle(self):
         return type(self)(pc.utf8_is_title(self._pa_array))
 
-    def _str_capitalize(self):
-        return type(self)(pc.utf8_capitalize(self._pa_array))
-
-    def _str_title(self):
-        return type(self)(pc.utf8_title(self._pa_array))
-
     def _str_isupper(self):
         return type(self)(pc.utf8_is_upper(self._pa_array))
 
-    def _str_swapcase(self):
-        return type(self)(pc.utf8_swapcase(self._pa_array))
-
     def _str_len(self):
         return type(self)(pc.utf8_length(self._pa_array))
 
@@ -2223,12 +2156,6 @@ def _str_removeprefix(self, prefix: str):
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))
 
-    def _str_removesuffix(self, suffix: str):
-        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
-        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
-        result = pc.if_else(ends_with, removed, self._pa_array)
-        return type(self)(result)
-
     def _str_casefold(self):
         predicate = lambda val: val.casefold()
         result = self._apply_elementwise(predicate)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 25f1c2ec6ce4f..1e285f90e9fea 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -76,7 +76,7 @@ class StringDtype(StorageExtensionDtype):
 
     Parameters
     ----------
-    storage : {"python", "pyarrow"}, optional
+    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
         If not given, the value of ``pd.options.mode.string_storage``.
 
     Attributes
@@ -108,11 +108,11 @@ def na_value(self) -> libmissing.NAType:
     def __init__(self, storage=None) -> None:
         if storage is None:
             storage = get_option("mode.string_storage")
-        if storage not in {"python", "pyarrow"}:
+        if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
             raise ValueError(
                 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
             )
-        if storage == "pyarrow" and pa_version_under7p0:
+        if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0:
             raise ImportError(
                 "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
             )
@@ -160,6 +160,8 @@ def construct_from_string(cls, string):
             return cls(storage="python")
         elif string == "string[pyarrow]":
             return cls(storage="pyarrow")
+        elif string == "string[pyarrow_numpy]":
+            return cls(storage="pyarrow_numpy")
         else:
             raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
 
@@ -176,12 +178,17 @@ def construct_array_type(  # type: ignore[override]
         -------
         type
         """
-        from pandas.core.arrays.string_arrow import ArrowStringArray
+        from pandas.core.arrays.string_arrow import (
+            ArrowStringArray,
+            ArrowStringArrayNumpySemantics,
+        )
 
         if self.storage == "python":
             return StringArray
-        else:
+        elif self.storage == "pyarrow":
             return ArrowStringArray
+        else:
+            return ArrowStringArrayNumpySemantics
 
     def __from_arrow__(
         self, array: pyarrow.Array | pyarrow.ChunkedArray
@@ -193,6 +200,10 @@ def __from_arrow__(
             from pandas.core.arrays.string_arrow import ArrowStringArray
 
             return ArrowStringArray(array)
+        elif self.storage == "pyarrow_numpy":
+            from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
+
+            return ArrowStringArrayNumpySemantics(array)
         else:
             import pyarrow
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 474351fc6b1d7..20dc8978b63df 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import partial
 import re
 from typing import (
     TYPE_CHECKING,
@@ -27,6 +28,7 @@
 )
 from pandas.core.dtypes.missing import isna
 
+from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays.arrow import ArrowExtensionArray
 from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.arrays.integer import Int64Dtype
@@ -114,10 +116,11 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
     _dtype: StringDtype  # type: ignore[assignment]
     _result_converter = lambda result: BooleanDtype().__from_arrow__(result)
+    _storage = "pyarrow"
 
     def __init__(self, values) -> None:
         super().__init__(values)
-        self._dtype = StringDtype(storage="pyarrow")
+        self._dtype = StringDtype(storage=self._storage)
 
         if not pa.types.is_string(self._pa_array.type) and not (
             pa.types.is_dictionary(self._pa_array.type)
@@ -145,7 +148,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
 
         if dtype and not (isinstance(dtype, str) and dtype == "string"):
             dtype = pandas_dtype(dtype)
-            assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
+            assert isinstance(dtype, StringDtype) and dtype.storage in (
+                "pyarrow",
+                "pyarrow_numpy",
+            )
 
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype in ensure_string_array and
@@ -438,6 +444,12 @@ def _str_rstrip(self, to_strip=None):
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
     _result_converter = lambda result: result.to_numpy()
+    _storage = "pyarrow_numpy"
+
+    def __getattribute__(self, item):
+        if item in ArrowStringArrayMixin.__dict__:
+            return partial(getattr(ArrowStringArrayMixin, item), self)
+        return super().__getattribute__(item)
 
     def _str_len(self):
         result = pc.utf8_length(self._pa_array)
@@ -504,3 +516,44 @@ def g(x):
             if convert and result.dtype == object:
                 result = lib.maybe_convert_objects(result)
         return result
+
+    def _str_count(self, pat: str, flags: int = 0):
+        if flags:
+            return super()._str_count(pat, flags)
+        return pc.count_substring_regex(self._pa_array, pat).to_numpy()
+
+    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
+        if start != 0 and end is not None:
+            slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
+            result = pc.find_substring(slices, sub)
+            not_found = pc.equal(result, -1)
+            # Matches are relative to the slice, so shift them back by ``start``.
+            result = pc.if_else(not_found, result, pc.add(result, start))
+        elif start == 0 and end is None:
+            result = pc.find_substring(self._pa_array, sub)
+        else:
+            return super()._str_find(sub, start, end)
+        # Integer positions are not string data; return NumPy like ``_str_count``.
+        return result.to_numpy()
+
+    def _str_split(
+        self,
+        pat: str | None = None,
+        n: int | None = -1,
+        expand: bool = False,
+        regex: bool | None = None,
+    ):
+        if n in {-1, 0}:
+            n = None
+        if regex:
+            split_func = pc.split_pattern_regex
+        else:
+            split_func = pc.split_pattern
+        return split_func(self._pa_array, pat, max_splits=n).to_numpy()
+
+    def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
+        if n in {-1, 0}:
+            n = None
+        return pc.split_pattern(
+            self._pa_array, pat, max_splits=n, reverse=True
+        ).to_numpy()

From 2dbcfb0b91ab953ea933afecf9d2003d86873b31 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 13 Aug 2023 21:57:53 +0200
Subject: [PATCH 03/23] Implement Arrow String Array that is compatible with
 NumPy semantics

---
 pandas/conftest.py                         |   2 +
 pandas/core/arrays/_arrow_string_mixins.py |  25 +---
 pandas/core/arrays/arrow/array.py          |  26 +++-
 pandas/core/arrays/string_arrow.py         | 163 ++++++++++-----------
 pandas/core/config_init.py                 |   2 +-
 pandas/core/strings/accessor.py            |   4 +-
 pandas/tests/arrays/string_/test_string.py | 100 +++++++++----
 pandas/tests/arrays/test_datetimelike.py   |   7 +-
 pandas/tests/extension/base/methods.py     |   3 +
 pandas/tests/extension/test_string.py      |   8 +-
 pandas/tests/io/conftest.py                |  16 ++
 pandas/tests/strings/__init__.py           |   1 +
 pandas/tests/strings/test_case_justify.py  |   2 +
 pandas/tests/strings/test_find_replace.py  |  63 ++++----
 pandas/tests/strings/test_strings.py       |  18 ++-
 15 files changed, 261 insertions(+), 179 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index f756da82157b8..e0aae234144f5 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1321,6 +1321,7 @@ def nullable_string_dtype(request):
     params=[
         "python",
         pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
+        pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
     ]
 )
 def string_storage(request):
@@ -1380,6 +1381,7 @@ def object_dtype(request):
         "object",
         "string[python]",
         pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
     ]
 )
 def any_string_dtype(request):
diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index 62857285cadef..29ea4c6dc221d 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -2,6 +2,8 @@
 
 from typing import Literal
 
+import numpy as np
+
 from pandas.compat import pa_version_under7p0
 
 if not pa_version_under7p0:
@@ -48,27 +50,6 @@ def _str_get(self, i: int):
         result = pc.if_else(not_out_of_bounds, selected, null_value)
         return type(self)(result)
 
-    def _str_partition(self, sep: str, expand: bool):
-        predicate = lambda val: val.partition(sep)
-        result = self._apply_elementwise(predicate)
-        return type(self)(pa.chunked_array(result))
-
-    def _str_rpartition(self, sep: str, expand: bool):
-        predicate = lambda val: val.rpartition(sep)
-        result = self._apply_elementwise(predicate)
-        return type(self)(pa.chunked_array(result))
-
-    def _str_slice(
-        self, start: int | None = None, stop: int | None = None, step: int | None = None
-    ):
-        if start is None:
-            start = 0
-        if step is None:
-            step = 1
-        return type(self)(
-            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
-        )
-
     def _str_slice_replace(
         self, start: int | None = None, stop: int | None = None, repl: str | None = None
     ):
@@ -76,6 +57,8 @@ def _str_slice_replace(
             repl = ""
         if start is None:
             start = 0
+        if stop is None:
+            stop = np.iinfo(np.int64).max
         return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
 
     def _str_capitalize(self):
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index e83ce450218a5..d4c68c6870681 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -512,7 +512,10 @@ def __getitem__(self, item: PositionalIndexer):
         if isinstance(item, np.ndarray):
             if not len(item):
                 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
-                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
+                if self._dtype.name == "string" and self._dtype.storage in (
+                    "pyarrow",
+                    "pyarrow_numpy",
+                ):
                     pa_dtype = pa.string()
                 else:
                     pa_dtype = self._dtype.pyarrow_dtype
@@ -1997,6 +2000,27 @@ def _str_count(self, pat: str, flags: int = 0):
             raise NotImplementedError(f"count not implemented with {flags=}")
         return type(self)(pc.count_substring_regex(self._pa_array, pat))
 
+    def _str_slice(
+        self, start: int | None = None, stop: int | None = None, step: int | None = None
+    ):
+        if start is None:
+            start = 0
+        if step is None:
+            step = 1
+        return type(self)(
+            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
+        )
+
+    def _str_partition(self, sep: str, expand: bool):
+        predicate = lambda val: val.partition(sep)
+        result = self._apply_elementwise(predicate)
+        return type(self)(pa.chunked_array(result))
+
+    def _str_rpartition(self, sep: str, expand: bool):
+        predicate = lambda val: val.rpartition(sep)
+        result = self._apply_elementwise(predicate)
+        return type(self)(pa.chunked_array(result))
+
     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
     ):
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 20dc8978b63df..099e49980d27e 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -115,7 +115,9 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     # error: Incompatible types in assignment (expression has type "StringDtype",
     # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
     _dtype: StringDtype  # type: ignore[assignment]
-    _result_converter = lambda result: BooleanDtype().__from_arrow__(result)
+    _result_converter = lambda _, result, **kwargs: BooleanDtype().__from_arrow__(
+        result
+    )
     _storage = "pyarrow"
 
     def __init__(self, values) -> None:
@@ -320,7 +322,7 @@ def _str_contains(
             result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
         else:
             result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
-        result = self._result_converter(result)
+        result = self._result_converter(result, na=na)
         if not isna(na):
             result[isna(result)] = bool(na)
         return result
@@ -329,7 +331,7 @@ def _str_startswith(self, pat: str, na=None):
         result = pc.starts_with(self._pa_array, pattern=pat)
         if not isna(na):
             result = result.fill_null(na)
-        result = self._result_converter_(result)
+        result = self._result_converter(result)
         if not isna(na):
             result[isna(result)] = bool(na)
         return result
@@ -443,84 +445,87 @@ def _str_rstrip(self, to_strip=None):
 
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
-    _result_converter = lambda result: result.to_numpy()
+    # _result_converter = lambda _, result: result.to_numpy(na_value=np.nan)
     _storage = "pyarrow_numpy"
 
+    @staticmethod
+    def _result_converter(values, na=None):
+        if not isna(na):
+            values = values.fill_null(bool(na))
+        return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
+
     def __getattribute__(self, item):
         if item in ArrowStringArrayMixin.__dict__:
             return partial(getattr(ArrowStringArrayMixin, item), self)
         return super().__getattribute__(item)
 
-    def _str_len(self):
-        result = pc.utf8_length(self._pa_array)
-        return result.to_numpy()
-
     def _str_map(
         self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
     ):
-        """
-        Map a callable over valid elements of the array.
-
-        Parameters
-        ----------
-        f : Callable
-            A function to call on each non-NA element.
-        na_value : Scalar, optional
-            The value to set for NA values. Might also be used for the
-            fill value if the callable `f` raises an exception.
-            This defaults to ``self._str_na_value`` which is ``np.nan``
-            for object-dtype and Categorical and ``pd.NA`` for StringArray.
-        dtype : Dtype, optional
-            The dtype of the result array.
-        convert : bool, default True
-            Whether to call `maybe_convert_objects` on the resulting ndarray
-        """
         if dtype is None:
-            dtype = np.dtype("object")
+            dtype = self.dtype
         if na_value is None:
-            na_value = self._str_na_value
-
-        if not len(self):
-            return np.array([], dtype=dtype)
-
-        arr = np.asarray(self, dtype=object)
-        mask = isna(arr)
-        map_convert = convert and not np.all(mask)
-        try:
-            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
-        except (TypeError, AttributeError) as err:
-            # Reraise the exception if callable `f` got wrong number of args.
-            # The user may want to be warned by this, instead of getting NaN
-            p_err = (
-                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
-                r"(?(3)required )positional arguments?"
+            na_value = self.dtype.na_value
+
+        mask = isna(self)
+        arr = np.asarray(self)
+
+        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+            if is_integer_dtype(dtype):
+                na_value = np.nan
+            else:
+                na_value = False
+            try:
+                result = lib.map_infer_mask(
+                    arr,
+                    f,
+                    mask.view("uint8"),
+                    convert=False,
+                    na_value=na_value,
+                    dtype=np.dtype(dtype),
+                )
+                return result
+
+            except ValueError:
+                result = lib.map_infer_mask(
+                    arr,
+                    f,
+                    mask.view("uint8"),
+                    convert=False,
+                    na_value=na_value,
+                )
+                if convert and result.dtype == object:
+                    result = lib.maybe_convert_objects(result)
+                return result
+
+        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # i.e. StringDtype
+            result = lib.map_infer_mask(
+                arr, f, mask.view("uint8"), convert=False, na_value=na_value
             )
+            result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
+            return type(self)(result)
+        else:
+            # This is when the result type is object. We reach this when
+            # -> We know the result type is truly object (e.g. .encode returns bytes
+            #    or .findall returns a list).
+            # -> We don't know the result type. E.g. `.get` can return anything.
+            return lib.map_infer_mask(arr, f, mask.view("uint8"))
 
-            if len(err.args) >= 1 and re.search(p_err, err.args[0]):
-                # FIXME: this should be totally avoidable
-                raise err
-
-            def g(x):
-                # This type of fallback behavior can be removed once
-                # we remove object-dtype .str accessor.
-                try:
-                    return f(x)
-                except (TypeError, AttributeError):
-                    return na_value
-
-            return self._str_map(g, na_value=na_value, dtype=dtype)
-        if not isinstance(result, np.ndarray):
-            return result
-        if na_value is not np.nan:
-            np.putmask(result, mask, na_value)
-            if convert and result.dtype == object:
-                result = lib.maybe_convert_objects(result)
+    def _convert_int_dtype(self, result):
+        if result.dtype == np.int32:
+            result = result.astype(np.int64)
         return result
 
     def _str_count(self, pat: str, flags: int = 0):
         if flags:
             return super()._str_count(pat, flags)
-        return pc.count_substring_regex(self._pa_array, pat).to_numpy()
+        result = pc.count_substring_regex(self._pa_array, pat).to_numpy()
+        return self._convert_int_dtype(result)
+
+    def _str_len(self):
+        result = pc.utf8_length(self._pa_array).to_numpy()
+        return self._convert_int_dtype(result)
 
     def _str_find(self, sub: str, start: int = 0, end: int | None = None):
         if start != 0 and end is not None:
@@ -534,26 +539,16 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             result = pc.find_substring(slices, sub)
         else:
             return super()._str_find(sub, start, end)
-        return type(self)(result)
+        return self._convert_int_dtype(result.to_numpy())
 
-    def _str_split(
-        self,
-        pat: str | None = None,
-        n: int | None = -1,
-        expand: bool = False,
-        regex: bool | None = None,
-    ):
-        if n in {-1, 0}:
-            n = None
-        if regex:
-            split_func = pc.split_pattern_regex
-        else:
-            split_func = pc.split_pattern
-        return split_func(self._pa_array, pat, max_splits=n).to_numpy()
-
-    def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
-        if n in {-1, 0}:
-            n = None
-        return pc.split_pattern(
-            self._pa_array, pat, max_splits=n, reverse=True
-        ).to_numpy()
+    def _cmp_method(self, other, op):
+        result = super()._cmp_method(other, op)
+        return result.to_numpy(na_value=False)
+
+    def value_counts(self, dropna: bool = True):
+        from pandas import Series
+
+        result = super().value_counts(dropna)
+        return Series(
+            result._values.to_numpy(), index=result.index, name=result.name, copy=False
+        )
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 27e9bf8958ab0..745689ab1fcc8 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -500,7 +500,7 @@ def use_inf_as_na_cb(key) -> None:
         "string_storage",
         "python",
         string_storage_doc,
-        validator=is_one_of_factory(["python", "pyarrow"]),
+        validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
     )
 
 
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index e59369db776da..6d9470eb730c5 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -145,7 +145,9 @@ def _map_and_wrap(name: str | None, docstring: str | None):
     @forbid_nonstring_types(["bytes"], name=name)
     def wrapper(self):
         result = getattr(self._data.array, f"_str_{name}")()
-        return self._wrap_result(result)
+        return self._wrap_result(
+            result, returns_string=name not in ("isnumeric", "isdecimal")
+        )
 
     wrapper.__doc__ = docstring
     return wrapper
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index cfd3314eb5944..b65fc52053414 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -9,7 +9,10 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.arrays.string_arrow import ArrowStringArray
+from pandas.core.arrays.string_arrow import (
+    ArrowStringArray,
+    ArrowStringArrayNumpySemantics,
+)
 from pandas.util.version import Version
 
 
@@ -33,7 +36,12 @@ def test_repr(dtype):
     expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
     assert repr(df.A) == expected
 
-    arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray"
+    if dtype.storage == "pyarrow":
+        arr_name = "ArrowStringArray"
+    elif dtype.storage == "pyarrow_numpy":
+        arr_name = "ArrowStringArrayNumpySemantics"
+    else:
+        arr_name = "StringArray"
     expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
     assert repr(df.A.array) == expected
 
@@ -116,7 +124,7 @@ def test_add(dtype):
 
 
 def test_add_2d(dtype, request):
-    if dtype.storage == "pyarrow":
+    if dtype.storage in ("pyarrow", "pyarrow_numpy"):
         reason = "Failed: DID NOT RAISE <class 'ValueError'>"
         mark = pytest.mark.xfail(raises=None, reason=reason)
         request.node.add_marker(mark)
@@ -145,7 +153,7 @@ def test_add_sequence(dtype):
 
 
 def test_mul(dtype, request):
-    if dtype.storage == "pyarrow":
+    if dtype.storage in ("pyarrow", "pyarrow_numpy"):
         reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
         mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
         request.node.add_marker(mark)
@@ -195,19 +203,30 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     other = "a"
     result = getattr(a, op_name)(other)
-    expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
-    expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
-    expected = pd.array(expected, dtype=expected_dtype)
-    tm.assert_extension_array_equal(result, expected)
+    if dtype.storage == "pyarrow_numpy":
+        expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
+        expected = pd.array(expected, dtype="boolean").to_numpy(na_value=False)
+        tm.assert_numpy_array_equal(result, expected)
+    else:
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
+        expected = pd.array(expected, dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
 
 
 def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     op_name = f"__{comparison_op.__name__}__"
     a = pd.array(["a", None, "c"], dtype=dtype)
     result = getattr(a, op_name)(pd.NA)
-    expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
-    expected = pd.array([None, None, None], dtype=expected_dtype)
-    tm.assert_extension_array_equal(result, expected)
+
+    if dtype.storage == "pyarrow_numpy":
+        expected = np.array([False, False, False], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+    else:
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = pd.array([None, None, None], dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
+        tm.assert_extension_array_equal(result, expected)
 
 
 def test_comparison_methods_scalar_not_string(comparison_op, dtype):
@@ -223,12 +242,21 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
         return
 
     result = getattr(a, op_name)(other)
-    expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[
-        op_name
-    ]
-    expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
-    expected = pd.array(expected_data, dtype=expected_dtype)
-    tm.assert_extension_array_equal(result, expected)
+
+    if dtype.storage == "pyarrow_numpy":
+        expected_data = {
+            "__eq__": [False, False, False],
+            "__ne__": [True, False, True],
+        }[op_name]
+        expected = np.array(expected_data, dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+    else:
+        expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[
+            op_name
+        ]
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = pd.array(expected_data, dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
 
 
 def test_comparison_methods_array(comparison_op, dtype):
@@ -237,15 +265,25 @@ def test_comparison_methods_array(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
-    expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
-    expected = np.full(len(a), fill_value=None, dtype="object")
-    expected[-1] = getattr(other[-1], op_name)(a[-1])
-    expected = pd.array(expected, dtype=expected_dtype)
-    tm.assert_extension_array_equal(result, expected)
+    if dtype.storage == "pyarrow_numpy":
+        expected = np.array([False, False, False], dtype=object)
+        expected[-1] = getattr(other[-1], op_name)(a[-1])
+        tm.assert_numpy_array_equal(result, expected)
 
-    result = getattr(a, op_name)(pd.NA)
-    expected = pd.array([None, None, None], dtype=expected_dtype)
-    tm.assert_extension_array_equal(result, expected)
+        result = getattr(a, op_name)(pd.NA)
+        expected = np.array([False, False, False], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    else:
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = np.full(len(a), fill_value=None, dtype="object")
+        expected[-1] = getattr(other[-1], op_name)(a[-1])
+        expected = pd.array(expected, dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
+
+        result = getattr(a, op_name)(pd.NA)
+        expected = pd.array([None, None, None], dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
 
 
 def test_constructor_raises(cls):
@@ -297,7 +335,7 @@ def test_from_sequence_no_mutate(copy, cls, request):
 
     result = cls._from_sequence(nan_arr, copy=copy)
 
-    if cls is ArrowStringArray:
+    if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics):
         import pyarrow as pa
 
         expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True))
@@ -370,7 +408,7 @@ def test_min_max(method, skipna, dtype, request):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("box", [pd.Series, pd.array])
 def test_min_max_numpy(method, box, dtype, request):
-    if dtype.storage == "pyarrow" and box is pd.array:
+    if dtype.storage in ("pyarrow", "pyarrow_numpy") and box is pd.array:
         if box is pd.array:
             reason = "'<=' not supported between instances of 'str' and 'NoneType'"
         else:
@@ -397,7 +435,7 @@ def test_fillna_args(dtype, request):
     expected = pd.array(["a", "b"], dtype=dtype)
     tm.assert_extension_array_equal(res, expected)
 
-    if dtype.storage == "pyarrow":
+    if dtype.storage in ("pyarrow", "pyarrow_numpy"):
         msg = "Invalid value '1' for dtype string"
     else:
         msg = "Cannot set non-string value '1' into a StringArray."
@@ -455,6 +493,8 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):
 def test_value_counts_na(dtype):
     if getattr(dtype, "storage", "") == "pyarrow":
         exp_dtype = "int64[pyarrow]"
+    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+        exp_dtype = "int64"
     else:
         exp_dtype = "Int64"
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
@@ -470,6 +510,8 @@ def test_value_counts_na(dtype):
 def test_value_counts_with_normalize(dtype):
     if getattr(dtype, "storage", "") == "pyarrow":
         exp_dtype = "double[pyarrow]"
+    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+        exp_dtype = np.float64
     else:
         exp_dtype = "Float64"
     ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
@@ -506,7 +548,7 @@ def test_use_inf_as_na(values, expected, dtype):
 def test_memory_usage(dtype):
     # GH 33963
 
-    if dtype.storage == "pyarrow":
+    if dtype.storage in ("pyarrow", "pyarrow_numpy"):
         pytest.skip(f"not applicable for {dtype.storage}")
 
     series = pd.Series(["a", "b", "c"], dtype=dtype)
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 9eee2e0bea687..20530e37116f2 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -324,7 +324,12 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage):
         ):
             arr.searchsorted("foo")
 
-        arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray"
+        if string_storage == "python":
+            arr_type = "StringArray"
+        elif string_storage == "pyarrow":
+            arr_type = "ArrowStringArray"
+        else:
+            arr_type = "ArrowStringArrayNumpySemantics"
 
         with pd.option_context("string_storage", string_storage):
             with pytest.raises(
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 2dd62a4ca7538..16059155a7a8f 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -70,6 +70,9 @@ def test_value_counts_with_normalize(self, data):
         ):
             # TODO: avoid special-casing
             expected = expected.astype("double[pyarrow]")
+        elif getattr(data.dtype, "storage", "") == "pyarrow_numpy":
+            # TODO: avoid special-casing
+            expected = expected.astype("float64")
         elif na_value_for_dtype(data.dtype) is pd.NA:
             # TODO(GH#44692): avoid special-casing
             expected = expected.astype("Float64")
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 6597ff84e3ca4..6088cd211d829 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -104,7 +104,7 @@ def test_is_not_string_type(self, dtype):
 
 class TestInterface(base.BaseInterfaceTests):
     def test_view(self, data, request):
-        if data.dtype.storage == "pyarrow":
+        if data.dtype.storage in ("pyarrow", "pyarrow_numpy"):
             pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_view(data)
 
@@ -117,7 +117,7 @@ def test_from_dtype(self, data):
 
 class TestReshaping(base.BaseReshapingTests):
     def test_transpose(self, data, request):
-        if data.dtype.storage == "pyarrow":
+        if data.dtype.storage in ("pyarrow", "pyarrow_numpy"):
             pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_transpose(data)
 
@@ -128,7 +128,7 @@ class TestGetitem(base.BaseGetitemTests):
 
 class TestSetitem(base.BaseSetitemTests):
     def test_setitem_preserves_views(self, data, request):
-        if data.dtype.storage == "pyarrow":
+        if data.dtype.storage in ("pyarrow", "pyarrow_numpy"):
             pytest.skip(reason="2D support not implemented for ArrowStringArray")
         super().test_setitem_preserves_views(data)
 
@@ -184,6 +184,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         # attribute "storage"
         if dtype.storage == "pyarrow":  # type: ignore[union-attr]
             cast_to = "boolean[pyarrow]"
+        elif dtype.storage == "pyarrow_numpy":
+            cast_to = np.bool_
         else:
             cast_to = "boolean"
         return pointwise_result.astype(cast_to)
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index 170e2f61e7d4a..701bfe3767db4 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -234,3 +234,19 @@ def compression_format(request):
 @pytest.fixture(params=_compression_formats_params)
 def compression_ext(request):
     return request.param[0]
+
+
+@pytest.fixture(
+    params=[
+        "python",
+        pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
+    ]
+)
+def string_storage(request):
+    """
+    Parametrized fixture for pd.options.mode.string_storage.
+
+    * 'python'
+    * 'pyarrow'
+    """
+    return request.param
diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py
index e69de29bb2d1d..326ae24410502 100644
--- a/pandas/tests/strings/__init__.py
+++ b/pandas/tests/strings/__init__.py
@@ -0,0 +1 @@
+object_pyarrow_numpy = ("object", "string[pyarrow_numpy]")
diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py
index ced941187f548..3de97cccd1d72 100644
--- a/pandas/tests/strings/test_case_justify.py
+++ b/pandas/tests/strings/test_case_justify.py
@@ -278,6 +278,8 @@ def test_center_ljust_rjust_mixed_object():
 
 
 def test_center_ljust_rjust_fillchar(any_string_dtype):
+    if any_string_dtype == "string[pyarrow_numpy]":
+        pytest.skip("Arrow logic is different")
     s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)
 
     result = s.str.center(5, fillchar="X")
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index c3cc8b3643ed2..f62299f53aebe 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -11,6 +11,7 @@
     Series,
     _testing as tm,
 )
+from pandas.tests.strings import object_pyarrow_numpy
 
 # --------------------------------------------------------------------------------------
 # str.contains
@@ -25,7 +26,7 @@ def test_contains(any_string_dtype):
     pat = "mmm[_]+"
 
     result = values.str.contains(pat)
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(
         np.array([False, np.nan, True, True, False], dtype=np.object_),
         dtype=expected_dtype,
@@ -44,7 +45,7 @@ def test_contains(any_string_dtype):
         dtype=any_string_dtype,
     )
     result = values.str.contains(pat)
-    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -71,14 +72,14 @@ def test_contains(any_string_dtype):
     pat = "mmm[_]+"
 
     result = values.str.contains(pat)
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(
         np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype
     )
     tm.assert_series_equal(result, expected)
 
     result = values.str.contains(pat, na=False)
-    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -163,7 +164,7 @@ def test_contains_moar(any_string_dtype):
     )
 
     result = s.str.contains("a")
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(
         [False, False, False, True, True, False, np.nan, False, False, True],
         dtype=expected_dtype,
@@ -204,7 +205,7 @@ def test_contains_nan(any_string_dtype):
     s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)
 
     result = s.str.contains("foo", na=False)
-    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series([False, False, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -215,12 +216,14 @@ def test_contains_nan(any_string_dtype):
     result = s.str.contains("foo", na="foo")
     if any_string_dtype == "object":
         expected = Series(["foo", "foo", "foo"], dtype=np.object_)
+    elif any_string_dtype == "string[pyarrow_numpy]":
+        expected = Series([True, True, True], dtype=np.bool_)
     else:
         expected = Series([True, True, True], dtype="boolean")
     tm.assert_series_equal(result, expected)
 
     result = s.str.contains("foo")
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -379,7 +382,7 @@ def test_replace_unicode(any_string_dtype):
     ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
     expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
     tm.assert_series_equal(result, expected)
@@ -402,7 +405,7 @@ def test_replace_callable(any_string_dtype):
     # test with callable
     repl = lambda m: m.group(0).swapcase()
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
     expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
@@ -423,7 +426,7 @@ def test_replace_callable_raises(any_string_dtype, repl):
     )
     with pytest.raises(TypeError, match=msg):
         with tm.maybe_produces_warning(
-            PerformanceWarning, any_string_dtype == "string[pyarrow]"
+            PerformanceWarning, "string[pyarrow" in any_string_dtype
         ):
             values.str.replace("a", repl, regex=True)
 
@@ -434,7 +437,7 @@ def test_replace_callable_named_groups(any_string_dtype):
     pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
     repl = lambda m: m.group("middle").swapcase()
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace(pat, repl, regex=True)
     expected = Series(["bAR", np.nan], dtype=any_string_dtype)
@@ -448,14 +451,14 @@ def test_replace_compiled_regex(any_string_dtype):
     # test with compiled regex
     pat = re.compile(r"BAD_*")
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace(pat, "", regex=True)
     expected = Series(["foobar", np.nan], dtype=any_string_dtype)
     tm.assert_series_equal(result, expected)
 
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace(pat, "", n=1, regex=True)
     expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
@@ -477,7 +480,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype):
     expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
     pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace(pat, ", ", regex=True)
     tm.assert_series_equal(result, expected)
@@ -507,7 +510,7 @@ def test_replace_compiled_regex_callable(any_string_dtype):
     repl = lambda m: m.group(0).swapcase()
     pat = re.compile("[a-z][A-Z]{2}")
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace(pat, repl, n=2, regex=True)
     expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
@@ -558,7 +561,7 @@ def test_replace_moar(any_string_dtype):
     tm.assert_series_equal(result, expected)
 
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace("A", "YYY", case=False)
     expected = Series(
@@ -579,7 +582,7 @@ def test_replace_moar(any_string_dtype):
     tm.assert_series_equal(result, expected)
 
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
     expected = Series(
@@ -605,14 +608,14 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype):
     ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)
 
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace("a", "c", case=False, regex=False)
     expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
     tm.assert_series_equal(result, expected)
 
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.replace("a.", "c.", case=False, regex=False)
     expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
@@ -648,7 +651,7 @@ def test_replace_regex_single_character(regex, any_string_dtype):
 
 def test_match(any_string_dtype):
     # New match behavior introduced in 0.13
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
 
     values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
     result = values.str.match(".*(BAD[_]+).*(BAD)")
@@ -703,12 +706,12 @@ def test_match_na_kwarg(any_string_dtype):
     s = Series(["a", "b", np.nan], dtype=any_string_dtype)
 
     result = s.str.match("a", na=False)
-    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series([True, False, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
     result = s.str.match("a")
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series([True, False, np.nan], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -716,7 +719,7 @@ def test_match_na_kwarg(any_string_dtype):
 def test_match_case_kwarg(any_string_dtype):
     values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
     result = values.str.match("ab", case=False)
-    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series([True, True, True, True], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -732,7 +735,7 @@ def test_fullmatch(any_string_dtype):
         ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
     )
     result = ser.str.fullmatch(".*BAD[_]+.*BAD")
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series([True, False, np.nan, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -742,14 +745,14 @@ def test_fullmatch_na_kwarg(any_string_dtype):
         ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
     )
     result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)
-    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series([True, False, False, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
 
 def test_fullmatch_case_kwarg(any_string_dtype):
     ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
-    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
+    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
 
     expected = Series([True, False, False, False], dtype=expected_dtype)
 
@@ -762,7 +765,7 @@ def test_fullmatch_case_kwarg(any_string_dtype):
     tm.assert_series_equal(result, expected)
 
     with tm.maybe_produces_warning(
-        PerformanceWarning, any_string_dtype == "string[pyarrow]"
+        PerformanceWarning, "string[pyarrow" in any_string_dtype
     ):
         result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
     tm.assert_series_equal(result, expected)
@@ -825,7 +828,7 @@ def test_find(any_string_dtype):
     ser = Series(
         ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype
     )
-    expected_dtype = np.int64 if any_string_dtype == "object" else "Int64"
+    expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64"
 
     result = ser.str.find("EF")
     expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype)
@@ -877,7 +880,7 @@ def test_find_nan(any_string_dtype):
     ser = Series(
         ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype
     )
-    expected_dtype = np.float64 if any_string_dtype == "object" else "Int64"
+    expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64"
 
     result = ser.str.find("EF")
     expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype)
@@ -944,7 +947,7 @@ def test_flags_kwarg(any_string_dtype):
 
     pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
 
-    using_pyarrow = any_string_dtype == "string[pyarrow]"
+    using_pyarrow = "string[pyarrow" in any_string_dtype
 
     result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
     assert result.iloc[0].tolist() == ["dave", "google", "com"]
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index 1e573bdfe8fb5..bdae496cdc121 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -14,6 +14,7 @@
 )
 import pandas._testing as tm
 from pandas.core.strings.accessor import StringMethods
+from pandas.tests.strings import object_pyarrow_numpy
 
 
 @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
@@ -40,7 +41,7 @@ def test_iter_raises():
 def test_count(any_string_dtype):
     ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)
     result = ser.str.count("f[o]+")
-    expected_dtype = np.float64 if any_string_dtype == "object" else "Int64"
+    expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64"
     expected = Series([1, 2, np.nan, 4], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -91,7 +92,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat):
 
 def test_empty_str_methods(any_string_dtype):
     empty_str = empty = Series(dtype=any_string_dtype)
-    if any_string_dtype == "object":
+    if any_string_dtype in object_pyarrow_numpy:
         empty_int = Series(dtype="int64")
         empty_bool = Series(dtype=bool)
     else:
@@ -205,7 +206,7 @@ def test_ismethods(method, expected, any_string_dtype):
     ser = Series(
         ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", "  "], dtype=any_string_dtype
     )
-    expected_dtype = "bool" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(expected, dtype=expected_dtype)
     result = getattr(ser.str, method)()
     tm.assert_series_equal(result, expected)
@@ -230,7 +231,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
     ser = Series(
         ["A", "3", "¼", "★", "፸", "３", "four"], dtype=any_string_dtype  # noqa: RUF001
     )
-    expected_dtype = "bool" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(expected, dtype=expected_dtype)
     result = getattr(ser.str, method)()
     tm.assert_series_equal(result, expected)
@@ -250,7 +251,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
 def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
     values = ["A", np.nan, "¼", "★", np.nan, "３", "four"]  # noqa: RUF001
     ser = Series(values, dtype=any_string_dtype)
-    expected_dtype = "object" if any_string_dtype == "object" else "boolean"
+    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
     expected = Series(expected, dtype=expected_dtype)
     result = getattr(ser.str, method)()
     tm.assert_series_equal(result, expected)
@@ -280,7 +281,7 @@ def test_len(any_string_dtype):
         dtype=any_string_dtype,
     )
     result = ser.str.len()
-    expected_dtype = "float64" if any_string_dtype == "object" else "Int64"
+    expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64"
     expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -309,7 +310,8 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec
     obj = index_or_series(
         ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
     )
-    expected_dtype = np.int64 if any_string_dtype == "object" else "Int64"
+
+    expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64"
     expected = index_or_series(expected, dtype=expected_dtype)
 
     result = getattr(obj.str, method)(sub, start, end)
@@ -350,7 +352,7 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method):
 )
 def test_index_missing(any_string_dtype, method, exp):
     ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype)
-    expected_dtype = np.float64 if any_string_dtype == "object" else "Int64"
+    expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64"
 
     result = getattr(ser.str, method)("b")
     expected = Series(exp + [np.nan], dtype=expected_dtype)

From d9e61e542d9252928d8119f65894381f749de0c6 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 13 Aug 2023 22:00:05 +0200
Subject: [PATCH 04/23] Move methods

---
 pandas/core/arrays/arrow/array.py | 42 +++++++++++++++----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index d4c68c6870681..b2b1b92e569e3 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2000,27 +2000,6 @@ def _str_count(self, pat: str, flags: int = 0):
             raise NotImplementedError(f"count not implemented with {flags=}")
         return type(self)(pc.count_substring_regex(self._pa_array, pat))
 
-    def _str_slice(
-        self, start: int | None = None, stop: int | None = None, step: int | None = None
-    ):
-        if start is None:
-            start = 0
-        if step is None:
-            step = 1
-        return type(self)(
-            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
-        )
-
-    def _str_partition(self, sep: str, expand: bool):
-        predicate = lambda val: val.partition(sep)
-        result = self._apply_elementwise(predicate)
-        return type(self)(pa.chunked_array(result))
-
-    def _str_rpartition(self, sep: str, expand: bool):
-        predicate = lambda val: val.rpartition(sep)
-        result = self._apply_elementwise(predicate)
-        return type(self)(pa.chunked_array(result))
-
     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
     ):
@@ -2113,6 +2092,27 @@ def _str_join(self, sep: str):
             result = self._pa_array
         return type(self)(pc.binary_join(result, sep))
 
+    def _str_partition(self, sep: str, expand: bool):
+        predicate = lambda val: val.partition(sep)
+        result = self._apply_elementwise(predicate)
+        return type(self)(pa.chunked_array(result))
+
+    def _str_rpartition(self, sep: str, expand: bool):
+        predicate = lambda val: val.rpartition(sep)
+        result = self._apply_elementwise(predicate)
+        return type(self)(pa.chunked_array(result))
+
+    def _str_slice(
+        self, start: int | None = None, stop: int | None = None, step: int | None = None
+    ):
+        if start is None:
+            start = 0
+        if step is None:
+            step = 1
+        return type(self)(
+            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
+        )
+
     def _str_isalnum(self):
         return type(self)(pc.utf8_is_alnum(self._pa_array))
 

From 3188c25cc74859694e7991e54f9a4f1333e2d71d Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 13 Aug 2023 22:58:17 +0200
Subject: [PATCH 05/23] Refactor

---
 pandas/core/arrays/string_arrow.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 099e49980d27e..aa9deae0b1964 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -115,9 +115,6 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     # error: Incompatible types in assignment (expression has type "StringDtype",
     # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
     _dtype: StringDtype  # type: ignore[assignment]
-    _result_converter = lambda _, result, **kwargs: BooleanDtype().__from_arrow__(
-        result
-    )
     _storage = "pyarrow"
 
     def __init__(self, values) -> None:
@@ -187,6 +184,10 @@ def insert(self, loc: int, item) -> ArrowStringArray:
             raise TypeError("Scalar must be NA or str")
         return super().insert(loc, item)
 
+    @staticmethod
+    def _result_converter(values, **kwargs):
+        return BooleanDtype().__from_arrow__(values)
+
     def _maybe_convert_setitem_value(self, value):
         """Maybe convert value to be pyarrow compatible."""
         if is_scalar(value):

From cd19bfb43d077e379b12b80d0c30e51724b6f5ec Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 14 Aug 2023 12:16:31 +0200
Subject: [PATCH 06/23] Refactor

---
 pandas/tests/strings/test_strings.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index bdae496cdc121..4315835b70a40 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -310,7 +310,6 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec
     obj = index_or_series(
         ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
     )
-
     expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64"
     expected = index_or_series(expected, dtype=expected_dtype)
 

From c73c6b0e964cf1694feefe2a2c390f7e3dd5f0b8 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 14 Aug 2023 12:19:19 +0200
Subject: [PATCH 07/23] Remove

---
 pandas/core/arrays/arrow/array.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 1ff3b31389bb2..dea502c6dc090 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -250,12 +250,6 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
             )
         self._dtype = ArrowDtype(self._pa_array.type)
 
-    def __dir__(self):
-        o = set(dir(type(self)))
-        o.update(self.__dict__)
-        o.update(set(dir(ArrowStringArrayMixin)))
-        return list(o)
-
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
         """

From 6b2630902c93a0ea49a883fc3bbd76d5360db364 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 14 Aug 2023 13:46:47 +0200
Subject: [PATCH 08/23] Fix

---
 pandas/core/arrays/string_arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index aa9deae0b1964..bc9add038301d 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -456,7 +456,7 @@ def _result_converter(values, na=None):
         return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
 
     def __getattribute__(self, item):
-        if item in ArrowStringArrayMixin.__dict__:
+        if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array":
             return partial(getattr(ArrowStringArrayMixin, item), self)
         return super().__getattribute__(item)
 

From da6d67c7447a6c51262c6c4a556684ca41fc08c8 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 14 Aug 2023 17:21:06 +0200
Subject: [PATCH 09/23] Update

---
 pandas/tests/arrays/string_/test_string.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index b65fc52053414..3d66d45151e05 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat.pyarrow import pa_version_under12p0
+
 from pandas.core.dtypes.common import is_dtype_equal
 
 import pandas as pd
@@ -13,7 +15,6 @@
     ArrowStringArray,
     ArrowStringArrayNumpySemantics,
 )
-from pandas.util.version import Version
 
 
 @pytest.fixture
@@ -450,7 +451,7 @@ def test_arrow_array(dtype):
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
-    if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"):
+    if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
         expected = pa.chunked_array(expected)
 
     assert arr.equals(expected)

From d862ecaf3aa05082d36512f994d99106cc7c10b1 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Wed, 16 Aug 2023 18:53:57 +0200
Subject: [PATCH 10/23] Na return value

---
 pandas/core/arrays/string_.py                |  5 ++-
 pandas/tests/strings/__init__.py             | 14 +++++++
 pandas/tests/strings/test_find_replace.py    |  9 ++--
 pandas/tests/strings/test_split_partition.py | 44 +++++++-------------
 4 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 1e285f90e9fea..4c28360c732a3 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -101,7 +101,10 @@ class StringDtype(StorageExtensionDtype):
     #: StringDtype().na_value uses pandas.NA
     @property
     def na_value(self) -> libmissing.NAType:
-        return libmissing.NA
+        if self.storage == "pyarrow_numpy":
+            return np.nan
+        else:
+            return libmissing.NA
 
     _metadata = ("storage",)
 
diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py
index 326ae24410502..01b49b5e5b633 100644
--- a/pandas/tests/strings/__init__.py
+++ b/pandas/tests/strings/__init__.py
@@ -1 +1,15 @@
+import numpy as np
+
+import pandas as pd
+
 object_pyarrow_numpy = ("object", "string[pyarrow_numpy]")
+
+
+def _convert_na_value(ser, expected):
+    if ser.dtype != object:
+        if ser.dtype.storage == "pyarrow_numpy":
+            expected = expected.fillna(np.nan)
+        else:
+            # GH#18463
+            expected = expected.fillna(pd.NA)
+    return expected
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index f62299f53aebe..2320ab4ed8b02 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -11,7 +11,10 @@
     Series,
     _testing as tm,
 )
-from pandas.tests.strings import object_pyarrow_numpy
+from pandas.tests.strings import (
+    _convert_na_value,
+    object_pyarrow_numpy,
+)
 
 # --------------------------------------------------------------------------------------
 # str.contains
@@ -780,9 +783,7 @@ def test_findall(any_string_dtype):
     ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype)
     result = ser.str.findall("BAD[_]*")
     expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]])
-    if ser.dtype != object:
-        # GH#18463
-        expected = expected.fillna(pd.NA)
+    expected = _convert_na_value(ser, expected)
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
index 0298694ccaf71..0a7d409773dd6 100644
--- a/pandas/tests/strings/test_split_partition.py
+++ b/pandas/tests/strings/test_split_partition.py
@@ -12,6 +12,10 @@
     Series,
     _testing as tm,
 )
+from pandas.tests.strings import (
+    _convert_na_value,
+    object_pyarrow_numpy,
+)
 
 
 @pytest.mark.parametrize("method", ["split", "rsplit"])
@@ -20,9 +24,7 @@ def test_split(any_string_dtype, method):
 
     result = getattr(values.str, method)("_")
     exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
-    if values.dtype != object:
-        # GH#18463
-        exp = exp.fillna(pd.NA)
+    exp = _convert_na_value(values, exp)
     tm.assert_series_equal(result, exp)
 
 
@@ -32,9 +34,7 @@ def test_split_more_than_one_char(any_string_dtype, method):
     values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
     result = getattr(values.str, method)("__")
     exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
-    if values.dtype != object:
-        # GH#18463
-        exp = exp.fillna(pd.NA)
+    exp = _convert_na_value(values, exp)
     tm.assert_series_equal(result, exp)
 
     result = getattr(values.str, method)("__", expand=False)
@@ -46,9 +46,7 @@ def test_split_more_regex_split(any_string_dtype):
     values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
     result = values.str.split("[,_]")
     exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
-    if values.dtype != object:
-        # GH#18463
-        exp = exp.fillna(pd.NA)
+    exp = _convert_na_value(values, exp)
     tm.assert_series_equal(result, exp)
 
 
@@ -118,8 +116,8 @@ def test_split_object_mixed(expand, method):
 def test_split_n(any_string_dtype, method, n):
     s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
     expected = Series([["a", "b"], pd.NA, ["b", "c"]])
-
     result = getattr(s.str, method)(" ", n=n)
+    expected = _convert_na_value(s, expected)
     tm.assert_series_equal(result, expected)
 
 
@@ -128,9 +126,7 @@ def test_rsplit(any_string_dtype):
     values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
     result = values.str.rsplit("[,_]")
     exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
-    if values.dtype != object:
-        # GH#18463
-        exp = exp.fillna(pd.NA)
+    exp = _convert_na_value(values, exp)
     tm.assert_series_equal(result, exp)
 
 
@@ -139,9 +135,7 @@ def test_rsplit_max_number(any_string_dtype):
     values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
     result = values.str.rsplit("_", n=1)
     exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
-    if values.dtype != object:
-        # GH#18463
-        exp = exp.fillna(pd.NA)
+    exp = _convert_na_value(values, exp)
     tm.assert_series_equal(result, exp)
 
 
@@ -390,7 +384,7 @@ def test_split_nan_expand(any_string_dtype):
     # check that these are actually np.nan/pd.NA and not None
     # TODO see GH 18463
     # tm.assert_frame_equal does not differentiate
-    if any_string_dtype == "object":
+    if any_string_dtype in object_pyarrow_numpy:
         assert all(np.isnan(x) for x in result.iloc[1])
     else:
         assert all(x is pd.NA for x in result.iloc[1])
@@ -455,9 +449,7 @@ def test_partition_series_more_than_one_char(method, exp, any_string_dtype):
     s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype)
     result = getattr(s.str, method)("__", expand=False)
     expected = Series(exp)
-    if s.dtype != object:
-        # GH#18463
-        expected = expected.fillna(pd.NA)
+    expected = _convert_na_value(s, expected)
     tm.assert_series_equal(result, expected)
 
 
@@ -480,9 +472,7 @@ def test_partition_series_none(any_string_dtype, method, exp):
     s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype)
     result = getattr(s.str, method)(expand=False)
     expected = Series(exp)
-    if s.dtype != object:
-        # GH#18463
-        expected = expected.fillna(pd.NA)
+    expected = _convert_na_value(s, expected)
     tm.assert_series_equal(result, expected)
 
 
@@ -505,9 +495,7 @@ def test_partition_series_not_split(any_string_dtype, method, exp):
     s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype)
     result = getattr(s.str, method)("_", expand=False)
     expected = Series(exp)
-    if s.dtype != object:
-        # GH#18463
-        expected = expected.fillna(pd.NA)
+    expected = _convert_na_value(s, expected)
     tm.assert_series_equal(result, expected)
 
 
@@ -531,9 +519,7 @@ def test_partition_series_unicode(any_string_dtype, method, exp):
 
     result = getattr(s.str, method)("_", expand=False)
     expected = Series(exp)
-    if s.dtype != object:
-        # GH#18463
-        expected = expected.fillna(pd.NA)
+    expected = _convert_na_value(s, expected)
     tm.assert_series_equal(result, expected)
 
 

From 6cf263912ff682ac688a945fc27db454a6851f5a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Wed, 16 Aug 2023 22:40:39 +0200
Subject: [PATCH 11/23] Fix

---
 pandas/tests/strings/test_find_replace.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index e2d4619290a93..4cbfe97ad2ab4 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -19,7 +19,7 @@
 
 
 def using_pyarrow(dtype):
-    return dtype in ("string[pyarrow]",)
+    return dtype in ("string[pyarrow]", "string[pyarrow_numpy]")
 
 
 def test_contains(any_string_dtype):

From 48bd626c8bcc9df0c13dca08861c0b21be0bfcca Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Wed, 16 Aug 2023 23:10:27 +0200
Subject: [PATCH 12/23] Update

---
 pandas/tests/arrays/string_/test_string.py | 33 ++++++++++++++++------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 3d66d45151e05..d92f1c048a369 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -17,6 +17,13 @@
 )
 
 
+def na_val(dtype):
+    if dtype.storage == "pyarrow_numpy":
+        return np.nan
+    else:
+        return pd.NA
+
+
 @pytest.fixture
 def dtype(string_storage):
     """Fixture giving StringDtype from parametrized 'string_storage'"""
@@ -31,26 +38,34 @@ def cls(dtype):
 
 def test_repr(dtype):
     df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})
-    expected = "      A\n0     a\n1  <NA>\n2     b"
+    if dtype.storage == "pyarrow_numpy":
+        expected = "     A\n0    a\n1  NaN\n2    b"
+    else:
+        expected = "      A\n0     a\n1  <NA>\n2     b"
     assert repr(df) == expected
 
-    expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
+    if dtype.storage == "pyarrow_numpy":
+        expected = "0      a\n1    NaN\n2      b\nName: A, dtype: string"
+    else:
+        expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
     assert repr(df.A) == expected
 
     if dtype.storage == "pyarrow":
         arr_name = "ArrowStringArray"
+        expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
     elif dtype.storage == "pyarrow_numpy":
         arr_name = "ArrowStringArrayNumpySemantics"
+        expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
     else:
         arr_name = "StringArray"
-    expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
+        expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
     assert repr(df.A.array) == expected
 
 
 def test_none_to_nan(cls):
     a = cls._from_sequence(["a", None, "b"])
     assert a[1] is not None
-    assert a[1] is pd.NA
+    assert a[1] is na_val(a.dtype)
 
 
 def test_setitem_validates(cls):
@@ -206,7 +221,7 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
         expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
-        expected = pd.array(expected, dtype="boolean").to_numpy(na_value=False)
+        expected[1] = False
         tm.assert_numpy_array_equal(result, expected)
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -403,7 +418,7 @@ def test_min_max(method, skipna, dtype, request):
         expected = "a" if method == "min" else "c"
         assert result == expected
     else:
-        assert result is pd.NA
+        assert result is na_val(arr.dtype)
 
 
 @pytest.mark.parametrize("method", ["min", "max"])
@@ -471,7 +486,7 @@ def test_arrow_roundtrip(dtype, string_storage2):
     expected = df.astype(f"string[{string_storage2}]")
     tm.assert_frame_equal(result, expected)
     # ensure the missing value is represented by NA and not np.nan or None
-    assert result.loc[2, "a"] is pd.NA
+    assert result.loc[2, "a"] is na_val(result["a"].dtype)
 
 
 def test_arrow_load_from_zero_chunks(dtype, string_storage2):
@@ -569,7 +584,7 @@ def test_astype_from_float_dtype(float_dtype, dtype):
 def test_to_numpy_returns_pdna_default(dtype):
     arr = pd.array(["a", pd.NA, "b"], dtype=dtype)
     result = np.array(arr)
-    expected = np.array(["a", pd.NA, "b"], dtype=object)
+    expected = np.array(["a", na_val(dtype), "b"], dtype=object)
     tm.assert_numpy_array_equal(result, expected)
 
 
@@ -609,7 +624,7 @@ def test_setitem_scalar_with_mask_validation(dtype):
     mask = np.array([False, True, False])
 
     ser[mask] = None
-    assert ser.array[1] is pd.NA
+    assert ser.array[1] is na_val(ser.dtype)
 
     # for other non-string we should also raise an error
     ser = pd.Series(["a", "b", "c"], dtype=dtype)

From 4f9387a369fc3e894b96c11fdb2d3d487ec0d6ae Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Thu, 17 Aug 2023 00:32:43 +0200
Subject: [PATCH 13/23] Implement any and all for pyarrow numpy strings

---
 pandas/core/arrays/string_arrow.py      | 13 +++++++++++++
 pandas/tests/series/test_logical_ops.py | 18 ++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 2f9b2651103d9..b75b94008fe4f 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -553,3 +553,16 @@ def value_counts(self, dropna: bool = True):
         return Series(
             result._values.to_numpy(), index=result.index, name=result.name, copy=False
         )
+
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        if name in ["any", "all"]:
+            arr = pc.and_kleene(
+                pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "")
+            )
+            return ArrowExtensionArray(arr)._reduce(
+                name, skipna=skipna, keepdims=keepdims, **kwargs
+            )
+        else:
+            return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index 4dab3e8f62598..fa2e37ccc0cac 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -513,3 +513,21 @@ def test_int_dtype_different_index_not_bool(self):
 
         result = ser1 ^ ser2
         tm.assert_series_equal(result, expected)
+
+    def test_any_all(self):
+        # GH#54591
+        ser = Series(["", "a"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert not ser.all()
+
+        ser = Series([None, "a"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert not ser.all()
+
+        ser = Series([None, ""], dtype="string[pyarrow_numpy]")
+        assert not ser.any()
+        assert not ser.all()
+
+        ser = Series(["a", "b"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert ser.all()

From 606cd71c4adbef4ca03dbb86ca884775d595f0c5 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 21 Aug 2023 11:15:25 +0200
Subject: [PATCH 14/23] Fix typing

---
 pandas/core/arrays/string_.py         | 2 +-
 pandas/core/arrays/string_arrow.py    | 2 +-
 pandas/tests/extension/test_string.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 16cc18320fb65..79e5beb3175dc 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -100,7 +100,7 @@ class StringDtype(StorageExtensionDtype):
 
     #: StringDtype().na_value uses pandas.NA
     @property
-    def na_value(self) -> libmissing.NAType:
+    def na_value(self) -> libmissing.NAType | float:
         if self.storage == "pyarrow_numpy":
             return np.nan
         else:
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 2f9b2651103d9..30d8d22ef7f5a 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -483,7 +483,7 @@ def _str_map(
                     mask.view("uint8"),
                     convert=False,
                     na_value=na_value,
-                    dtype=np.dtype(dtype),
+                    dtype=np.dtype(dtype),  # type: ignore[arg-type]
                 )
                 return result
 
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 6088cd211d829..35071412a6e40 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -184,8 +184,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         # attribute "storage"
         if dtype.storage == "pyarrow":  # type: ignore[union-attr]
             cast_to = "boolean[pyarrow]"
-        elif dtype.storage == "pyarrow_numpy":
-            cast_to = np.bool_
+        elif dtype.storage == "pyarrow_numpy":  # type: ignore[union-attr]
+            cast_to = np.bool_  # type: ignore[assignment]
         else:
             cast_to = "boolean"
         return pointwise_result.astype(cast_to)

From 64145012462c59a805602601f082445a1734088a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 21 Aug 2023 11:25:05 +0200
Subject: [PATCH 15/23] Add check_reduce and _supports_reduction to string extension tests

---
 pandas/tests/extension/test_string.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 6088cd211d829..cde08c5372b91 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -168,6 +168,17 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
         with pytest.raises(TypeError):
             getattr(ser, op_name)(skipna=skipna)
 
+    def check_reduce(self, s, op_name, skipna):
+        res_op = getattr(s, op_name)
+        alt = s.astype("object")
+        exp_op = getattr(alt, op_name)
+        result = res_op(skipna=skipna)
+        expected = exp_op(skipna=skipna)
+        tm.assert_almost_equal(result, expected)
+
+    def _supports_reduction(self, obj, op_name: str) -> bool:
+        return obj.dtype.storage == "pyarrow_numpy"
+
 
 class TestMethods(base.BaseMethodsTests):
     pass

From 68acc329a9dd8662655ca86cca66e83adcb7006b Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 21 Aug 2023 12:08:46 +0200
Subject: [PATCH 16/23] Add pyarrow_numpy to arrow_string_storage fixture

---
 pandas/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index d0e0fca40de57..767855f13e16d 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -2002,4 +2002,4 @@ def warsaw(request) -> str:
 
 @pytest.fixture()
 def arrow_string_storage():
-    return ("pyarrow",)
+    return ("pyarrow", "pyarrow_numpy")

From fbab6fbfa866046d4a4d98b4ff93bdd7aaa7fab5 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 21 Aug 2023 12:08:59 +0200
Subject: [PATCH 17/23] Add pyarrow_numpy to arrow_string_storage fixture

---
 pandas/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index d0e0fca40de57..767855f13e16d 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -2002,4 +2002,4 @@ def warsaw(request) -> str:
 
 @pytest.fixture()
 def arrow_string_storage():
-    return ("pyarrow",)
+    return ("pyarrow", "pyarrow_numpy")

From 68e5f8f2fb310c549f2ab62a5342b25669d369bd Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 21 Aug 2023 13:24:15 +0200
Subject: [PATCH 18/23] Support any/all reductions for pyarrow_numpy in TestReduce

---
 pandas/tests/extension/test_string.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index fd8e06ce148ed..68433cf610f92 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -158,7 +158,11 @@ def test_fillna_no_op_returns_copy(self, data):
 
 class TestReduce(base.BaseReduceTests):
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
-        return op_name in ["min", "max"]
+        return (
+            op_name in ["min", "max"]
+            or ser.dtype.storage == "pyarrow_numpy"
+            and op_name in ("any", "all")
+        )
 
 
 class TestMethods(base.BaseMethodsTests):

From 032200612f139eadc34cd0c3068c5fd25c8912f0 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 21 Aug 2023 13:25:27 +0200
Subject: [PATCH 19/23] Move test

---
 pandas/tests/reductions/test_reductions.py | 18 ++++++++++++++++++
 pandas/tests/series/test_logical_ops.py    | 18 ------------------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 87892a81cef3d..8fb3338229f27 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1639,3 +1639,21 @@ def test_multimode_complex(self, array, expected, dtype):
         # Complex numbers are sorted by their magnitude
         result = Series(array, dtype=dtype).mode()
         tm.assert_series_equal(result, expected)
+
+    def test_any_all(self):
+        # GH#54591
+        ser = Series(["", "a"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert not ser.all()
+
+        ser = Series([None, "a"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert not ser.all()
+
+        ser = Series([None, ""], dtype="string[pyarrow_numpy]")
+        assert not ser.any()
+        assert not ser.all()
+
+        ser = Series(["a", "b"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert ser.all()
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index dd2b90db69039..26046ef9ba295 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -513,21 +513,3 @@ def test_int_dtype_different_index_not_bool(self):
 
         result = ser1 ^ ser2
         tm.assert_series_equal(result, expected)
-
-    def test_any_all(self):
-        # GH#54591
-        ser = Series(["", "a"], dtype="string[pyarrow_numpy]")
-        assert ser.any()
-        assert not ser.all()
-
-        ser = Series([None, "a"], dtype="string[pyarrow_numpy]")
-        assert ser.any()
-        assert not ser.all()
-
-        ser = Series([None, ""], dtype="string[pyarrow_numpy]")
-        assert not ser.any()
-        assert not ser.all()
-
-        ser = Series(["a", "b"], dtype="string[pyarrow_numpy]")
-        assert ser.any()
-        assert ser.all()

From 8bb52f4ef37a36a87f136aa78e706b90b23a098f Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 21 Aug 2023 22:57:43 +0200
Subject: [PATCH 20/23] Skip test when no pa

---
 pandas/tests/reductions/test_reductions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 8fb3338229f27..439f6681d7958 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1642,6 +1642,7 @@ def test_multimode_complex(self, array, expected, dtype):
 
     def test_any_all(self):
         # GH#54591
+        pytest.importorskip("pyarrow")
         ser = Series(["", "a"], dtype="string[pyarrow_numpy]")
         assert ser.any()
         assert not ser.all()

From cc8e6f7929da99e20fe424aec7dd2bb2efb6c20b Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Tue, 22 Aug 2023 17:23:57 +0200
Subject: [PATCH 21/23] Fix typing

---
 pandas/core/arrays/string_.py         | 2 +-
 pandas/tests/extension/test_string.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 79e5beb3175dc..7c3930c694e2f 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -100,7 +100,7 @@ class StringDtype(StorageExtensionDtype):
 
     #: StringDtype().na_value uses pandas.NA
     @property
-    def na_value(self) -> libmissing.NAType | float:
+    def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
         if self.storage == "pyarrow_numpy":
             return np.nan
         else:
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 68433cf610f92..775be6be78636 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -160,7 +160,7 @@ class TestReduce(base.BaseReduceTests):
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         return (
             op_name in ["min", "max"]
-            or ser.dtype.storage == "pyarrow_numpy"
+            or ser.dtype.storage == "pyarrow_numpy"  # type: ignore[union-attr]
             and op_name in ("any", "all")
         )
 

From 33355a719eb51f68c99cb26d072ddd96a74065bc Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Wed, 23 Aug 2023 23:12:11 +0200
Subject: [PATCH 22/23] Expect bool ndarrays in pyarrow_numpy comparison tests

---
 pandas/tests/arrays/string_/test_string.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 17730b5ff6647..24d8e43708b91 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -228,9 +228,9 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     other = "a"
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
+        expected = np.array([getattr(item, op_name)(other) for item in a])
         expected[1] = False
-        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
         expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
@@ -244,7 +244,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     result = getattr(a, op_name)(pd.NA)
 
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False], dtype=object)
+        expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -272,7 +272,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
             "__eq__": [False, False, False],
             "__ne__": [True, False, True],
         }[op_name]
-        expected = np.array(expected_data, dtype=object)
+        expected = np.array(expected_data)
         tm.assert_numpy_array_equal(result, expected)
     else:
         expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[
@@ -290,12 +290,12 @@ def test_comparison_methods_array(comparison_op, dtype):
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False], dtype=object)
+        expected = np.array([False, False, False])
         expected[-1] = getattr(other[-1], op_name)(a[-1])
         tm.assert_numpy_array_equal(result, expected)
 
         result = getattr(a, op_name)(pd.NA)
-        expected = np.array([False, False, False], dtype=object)
+        expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
 
     else:

From 1facb7921332640c1a421c691610bc2744410ff0 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 28 Aug 2023 11:13:32 +0200
Subject: [PATCH 23/23] move + rename test

---
 pandas/tests/reductions/test_reductions.py | 38 +++++++++++-----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 439f6681d7958..021252500e814 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1078,6 +1078,25 @@ def test_any_all_datetimelike(self):
         assert df.any().all()
         assert not df.all().any()
 
+    def test_any_all_pyarrow_string(self):
+        # GH#54591
+        pytest.importorskip("pyarrow")
+        ser = Series(["", "a"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert not ser.all()
+
+        ser = Series([None, "a"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert not ser.all()
+
+        ser = Series([None, ""], dtype="string[pyarrow_numpy]")
+        assert not ser.any()
+        assert not ser.all()
+
+        ser = Series(["a", "b"], dtype="string[pyarrow_numpy]")
+        assert ser.any()
+        assert ser.all()
+
     def test_timedelta64_analytics(self):
         # index min/max
         dti = date_range("2012-1-1", periods=3, freq="D")
@@ -1639,22 +1658,3 @@ def test_multimode_complex(self, array, expected, dtype):
         # Complex numbers are sorted by their magnitude
         result = Series(array, dtype=dtype).mode()
         tm.assert_series_equal(result, expected)
-
-    def test_any_all(self):
-        # GH#54591
-        pytest.importorskip("pyarrow")
-        ser = Series(["", "a"], dtype="string[pyarrow_numpy]")
-        assert ser.any()
-        assert not ser.all()
-
-        ser = Series([None, "a"], dtype="string[pyarrow_numpy]")
-        assert ser.any()
-        assert not ser.all()
-
-        ser = Series([None, ""], dtype="string[pyarrow_numpy]")
-        assert not ser.any()
-        assert not ser.all()
-
-        ser = Series(["a", "b"], dtype="string[pyarrow_numpy]")
-        assert ser.any()
-        assert ser.all()