From 63a7fc57f1548ceb164708889dc5254f64981c23 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 10:13:33 +0200 Subject: [PATCH 01/12] String dtype: implement object-dtype based StringArray variant with NumPy semantics --- pandas/_libs/lib.pyx | 2 +- pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 2 + pandas/conftest.py | 2 + pandas/core/arrays/string_.py | 174 +++++++++++++++--- pandas/core/config_init.py | 6 +- pandas/core/construction.py | 4 +- pandas/tests/arrays/string_/test_string.py | 50 ++--- .../tests/arrays/string_/test_string_arrow.py | 4 +- pandas/tests/extension/test_string.py | 2 +- pandas/tests/series/test_constructors.py | 27 ++- pandas/tests/strings/__init__.py | 4 +- pandas/tests/strings/test_find_replace.py | 18 +- 13 files changed, 220 insertions(+), 77 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5b6d83ba8e9ee..2199071e7ec4f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2692,7 +2692,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index caa00b205a29c..cb6cecb4bdf08 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,6 +26,7 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, @@ -189,6 +190,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p0", "pa_version_under14p1", "pa_version_under16p0", + "HAS_PYARROW", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 5a96e5a4cc49a..2e0135a41d94d 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") + HAS_PYARROW = True except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -25,3 +26,4 @@ pa_version_under14p1 = True pa_version_under15p0 = True pa_version_under16p0 = True + HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 21100178262c8..9890deb4084f8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1292,6 +1292,7 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ "python", + "python_numpy", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] @@ -1353,6 +1354,7 @@ def object_dtype(request): params=[ "object", "string[python]", + "string[python_numpy]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 291cc2e62be62..7a4fa46e83c53 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,7 +1,9 @@ from __future__ import annotations +import operator from typing import ( TYPE_CHECKING, + Any, ClassVar, Literal, cast, @@ -9,7 +11,10 @@ import numpy as np -from pandas._config 
import get_option +from pandas._config import ( + get_option, + using_pyarrow_string_dtype, +) from pandas._libs import ( lib, @@ -17,7 +22,10 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under10p1, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -81,7 +89,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -113,7 +121,7 @@ class StringDtype(StorageExtensionDtype): # follows NumPy semantics, which uses nan. @property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": + if self.storage in ("pyarrow_numpy", "python_numpy"): return np.nan else: return libmissing.NA @@ -122,15 +130,17 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override] def __init__(self, storage=None) -> None: if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if using_pyarrow_string_dtype(): + if HAS_PYARROW: + storage = "pyarrow_numpy" + else: + storage = "python_numpy" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + if storage not in {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + "Storage must be 'python', 'pyarrow', 'python_numpy' or 'pyarrow_numpy'" + f". Got {storage} instead." 
) if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: raise ImportError( @@ -178,6 +188,8 @@ def construct_from_string(cls, string) -> Self: return cls() elif string == "string[python]": return cls(storage="python") + elif string == "string[python_numpy]": + return cls(storage="python_numpy") elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": @@ -207,6 +219,8 @@ def construct_array_type( # type: ignore[override] return StringArray elif self.storage == "pyarrow": return ArrowStringArray + elif self.storage == "python_numpy": + return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics @@ -238,7 +252,7 @@ def __from_arrow__( # convert chunk by chunk to numpy and concatenate then, to avoid # overflow for large string data when concatenating the pyarrow arrays arr = arr.to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = ensure_string_array(arr, na_value=self.na_value) results.append(arr) if len(chunks) == 0: @@ -248,11 +262,7 @@ def __from_arrow__( # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) - NDArrayBacked.__init__( - new_string_array, - arr, - StringDtype(storage="python"), - ) + NDArrayBacked.__init__(new_string_array, arr, self) return new_string_array @@ -360,6 +370,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" + _storage = "python" def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -367,7 +378,7 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) def _validate(self) -> None: """Validate that we only store NA or strings.""" @@ -385,22 +396,41 @@ def _validate(self) -> None: else: lib.convert_nans_to_NA(self._ndarray) + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + if isna(value): + return self.dtype.na_value + elif not isinstance(value, str): + raise TypeError( + f"Cannot set non-string value '{value}' into a string array." 
+ ) + return value + @classmethod def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False ) -> Self: if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "python", + "python_numpy", + ) + else: + if get_option("future.infer_string"): + dtype = StringDtype(storage="python_numpy") + else: + dtype = StringDtype(storage="python") from pandas.core.arrays.masked import BaseMaskedArray + na_value = dtype.na_value if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA + result[na_values] = na_value else: if lib.is_pyarrow_array(scalars): @@ -409,12 +439,12 @@ def _from_sequence( # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + NDArrayBacked.__init__(new_string_array, result, dtype) return new_string_array @@ -464,7 +494,7 @@ def __setitem__(self, key, value) -> None: # validate new items if scalar_value: if isna(value): - value = libmissing.NA + value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( f"Cannot set non-string value '{value}' into a StringArray." @@ -478,7 +508,7 @@ def __setitem__(self, key, value) -> None: mask = isna(value) if mask.any(): value = value.copy() - value[isna(value)] = libmissing.NA + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) @@ -591,9 +621,9 @@ def _cmp_method(self, other, op): if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") - result[mask] = libmissing.NA + result[mask] = self.dtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return self._from_backing_data(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -662,3 +692,97 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. 
return lib.map_infer_mask(arr, f, mask.view("uint8")) + + +class StringArrayNumpySemantics(StringArray): + _storage = "python_numpy" + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype is None: + dtype = StringDtype(storage="python_numpy") + return super()._from_sequence(scalars, dtype=dtype, copy=copy) + + def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: + # need to overrde NumpyExtensionArray._from_backing_data to ensure + # we always preserve the dtype + return NDArrayBacked._from_backing_data(self, arr) + + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + # the masked_reductions use pd.NA + if result is libmissing.NA: + return np.nan + return super()._wrap_reduction_result(axis, result) + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True) -> Series: + from pandas.core.algorithms import value_counts_internal as value_counts + + result = value_counts(self._ndarray, sort=False, dropna=dropna) + result.index = result.index.astype(self.dtype) + return result + + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + # if is_integer_dtype(dtype): + # na_value = np.nan + # else: + # na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 46c9139c3456c..c31834253a2dc 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -460,7 +460,9 @@ def is_terminal() -> bool: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + validator=is_one_of_factory( + ["python", "pyarrow", "python_numpy", "pyarrow_numpy"] + ), ) @@ -858,7 +860,7 @@ def register_converter_cb(key: str) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2718e9819cdf8..09985cfe61e28 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -581,7 +581,7 @@ def sanitize_array( ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype() data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -622,7 +622,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype() subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 597b407a29c94..e63f5d85afef1 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -14,6 +14,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringArrayNumpySemantics, @@ -21,7 +22,7 @@ def na_val(dtype): - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): return np.nan else: return pd.NA @@ -41,13 +42,13 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" @@ -59,6 +60,9 @@ def test_repr(dtype): elif dtype.storage == "pyarrow_numpy": arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + elif dtype.storage == "python_numpy": + arr_name = "StringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -74,14 +78,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if cls is pd.arrays.StringArray: + if dtype.storage in ("python", "python_numpy"): msg = "Cannot set non-string value '10' into a StringArray." 
else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if dtype.storage in ("python", "python_numpy"): msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -225,7 +229,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -244,7 +248,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -271,7 +275,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -293,7 +297,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -321,7 +325,7 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -332,7 +336,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -387,7 +391,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): err = ValueError msg = "cannot convert float NaN to integer" else: @@ -492,7 +496,7 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage == "python": + if dtype.storage in ("python", "python_numpy"): expected = pc.cast(expected, pa.string()) assert arr.equals(expected) @@ -502,7 +506,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": + if using_infer_string and string_storage2 not in ("python_numpy", "pyarrow_numpy"): request.applymarker( pytest.mark.xfail( reason="infer_string takes precedence over string storage" @@ -512,7 +516,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = 
pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "python_numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -542,7 +546,7 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "python_numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -558,7 +562,7 @@ def test_arrow_load_from_zero_chunks( def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + elif getattr(dtype, "storage", "") in ("python_numpy", "pyarrow_numpy"): exp_dtype = "int64" else: exp_dtype = "Int64" @@ -575,7 +579,7 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + elif getattr(dtype, "storage", "") in ("python_numpy", "pyarrow_numpy"): exp_dtype = np.float64 else: exp_dtype = "Float64" @@ -588,7 +592,7 @@ def test_value_counts_with_normalize(dtype): def test_value_counts_sort_false(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + elif getattr(dtype, "storage", "") in ("python_numpy", "pyarrow_numpy"): exp_dtype = "int64" else: exp_dtype = "Int64" @@ -641,7 +645,11 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - expected = pd.Series([True, False, True]) + if dtype.storage == "python_numpy": + # TODO what do we want here? + expected = pd.Series([True, False, False]) + else: + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -665,7 +673,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: + if dtype.storage in ("python", "python_numpy"): msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 405c1c217b04d..90ca48a6a469d 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -27,7 +27,7 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": + if using_infer_string and string_storage in ("python_numpy", "pyarrow_numpy"): request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage @@ -260,6 +260,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python', 'pyarrow', 'python_numpy' or 'pyarrow_numpy'." 
with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49ad3fce92a5c..7756ab40ebe56 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -200,7 +200,7 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): cast_to = dtype elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage in ("python_numpy", "pyarrow_numpy"): cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3f9d5bbe806bb..d9c45e1498d5c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,6 +14,7 @@ iNaT, lib, ) +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -2079,11 +2080,10 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) expected = Series(["a", 1], dtype="object") @@ -2094,35 +2094,34 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_scalar(self): # GH#54430 - pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): # GH#54793 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(result, expected) def test_series_constructor_infer_string_scalar(self): @@ -2135,10 +2134,10 @@ def test_series_constructor_infer_string_scalar(self): def 
test_series_string_inference_na_first(self): # GH#55655 - pytest.importorskip("pyarrow") - expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) def test_inference_on_pandas_objects(self): diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..4b3cc125fdf7d 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,12 +2,12 @@ import pandas as pd -object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") +object_pyarrow_numpy = ("object", "string[python_numpy]", "string[pyarrow_numpy]") def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = expected.fillna(np.nan) else: # GH#18463 diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index fb308b72e47f5..2ac362fbe1aea 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -220,14 +220,18 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": - expected = Series([True, True, True], dtype=np.bool_) + if any_string_dtype == "string[python_numpy]": + with pytest.raises(TypeError): + result = s.str.contains("foo", na="foo") else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, True, True], dtype=np.bool_) + else: + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" From 0eee6254c198266687fe6ebe02ef385cacbd31c3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 20:27:13 +0200 Subject: [PATCH 02/12] fix constructor to not convert to NA --- pandas/_testing/asserters.py | 10 ++++++++++ pandas/core/arrays/string_.py | 13 +++++++++++++ pandas/tests/arrays/string_/test_string.py | 6 +++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 543d7944e4c5d..ba7c28ba18c53 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -805,6 +805,16 @@ def assert_extension_array_equal( left_na, right_na, obj=f"{obj} NA mask", index_values=index_values ) + # Specifically for StringArrayNumpySemantics, validate here we have a valid array + if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy": + assert np.all( + [np.isnan(val) for val in left._ndarray[left_na]] + ), "wrong missing value sentinels" + if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy": + assert np.all( + [np.isnan(val) for val in right._ndarray[right_na]] + ), "wrong missing 
value sentinels" + left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) if check_exact: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7a4fa46e83c53..f23a34f3298dd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -697,6 +697,19 @@ def _str_map( class StringArrayNumpySemantics(StringArray): _storage = "python_numpy" + def _validate(self) -> None: + """Validate that we only store NaN or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN" + ) + if self._ndarray.dtype != "object": + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) + # TODO validate or force NA/None to NaN + @classmethod def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e63f5d85afef1..e88beb73bacff 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -325,8 +325,10 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: + if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" + elif cls is StringArrayNumpySemantics: + msg = "StringArrayNumpySemantics requires a sequence of strings or NaN" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -377,6 +379,8 @@ def test_from_sequence_no_mutate(copy, cls, dtype): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + elif cls is StringArrayNumpySemantics: + expected = cls(nan_arr) else: expected = cls(na_arr) From 607b95e376528ca690988716edf8034158f227c1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 20:34:49 +0200 Subject: [PATCH 03/12] fix typing --- pandas/_testing/asserters.py | 4 ++-- pandas/core/arrays/string_.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index ba7c28ba18c53..927003b0e955b 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -808,11 +808,11 @@ def assert_extension_array_equal( # Specifically for StringArrayNumpySemantics, validate here we have a valid array if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy": assert np.all( - [np.isnan(val) for val in left._ndarray[left_na]] + [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy": assert np.all( - [np.isnan(val) for val in right._ndarray[right_na]] + [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" left_valid = left[~left_na].to_numpy(dtype=object) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f23a34f3298dd..8aec02d2902aa 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -718,7 +718,7 @@ def _from_sequence( dtype = StringDtype(storage="python_numpy") return super()._from_sequence(scalars, dtype=dtype, copy=copy) - def _from_backing_data(self, arr: 
np.ndarray) -> NumpyExtensionArray: + def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # need to overrde NumpyExtensionArray._from_backing_data to ensure # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) From bca157daeb757f80e684d0bc1d36c3f3c0a6d45b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 20:55:56 +0200 Subject: [PATCH 04/12] improve logic in str_map --- pandas/core/arrays/string_.py | 48 ++++++++++++++++------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8aec02d2902aa..224bea4ac7b91 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -760,32 +760,28 @@ def _str_map( convert = convert and not np.all(mask) if is_integer_dtype(dtype) or is_bool_dtype(dtype): - # if is_integer_dtype(dtype): - # na_value = np.nan - # else: - # na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. 
StringDtype From ab96aa4734998dd8e44c75f3951973f7c8b24743 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jul 2024 09:28:47 +0200 Subject: [PATCH 05/12] remove most usage of python_numpy --- pandas/_testing/asserters.py | 20 +++++++++--- pandas/core/arrays/string_.py | 32 ++++++++++++------- pandas/tests/arrays/string_/test_string.py | 16 +++++----- .../tests/arrays/string_/test_string_arrow.py | 2 +- pandas/tests/extension/test_string.py | 2 +- 5 files changed, 46 insertions(+), 26 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 82fc4d51c5724..f627272de781b 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -578,13 +578,17 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} @@ -791,11 +795,19 @@ def assert_extension_array_equal( ) # Specifically for StringArrayNumpySemantics, validate here we have a valid array - if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy": + if ( + isinstance(left.dtype, StringDtype) + and left.dtype.storage == "python" + and left.dtype.na_value is np.nan + ): assert np.all( [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" - if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy": + if ( + isinstance(right.dtype, StringDtype) + and right.dtype.storage == "python" + and right.dtype.na_value is np.nan + ): assert np.all( [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 775e6cf47d1e8..b1751a24f1c4f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -146,6 +146,10 @@ def __init__( # TODO raise a deprecation warning storage = "pyarrow" na_value = np.nan + if storage == "python_numpy": + # TODO remove + storage = "python" + na_value = np.nan # validate options if storage not in {"python", "pyarrow"}: @@ -229,7 +233,8 @@ def construct_from_string(cls, string) -> Self: elif string == "string[python]": return cls(storage="python") elif string == "string[python_numpy]": - return cls(storage="python_numpy") + # TODO remove + return cls(storage="python", na_value=np.nan) elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": @@ -256,11 +261,11 @@ def construct_array_type( # type: ignore[override] ArrowStringArrayNumpySemantics, ) - if self.storage == "python": + if self.storage == "python" and self._na_value is libmissing.NA: return StringArray elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray - elif self.storage == "python_numpy": + elif self.storage == "python": return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics @@ -416,6 +421,7 @@ 
class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" _storage = "python" + _na_value = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -423,7 +429,11 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(storage=self._storage, na_value=self._na_value), + ) def _validate(self) -> None: """Validate that we only store NA or strings.""" @@ -457,13 +467,10 @@ def _from_sequence( ) -> Self: if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "python", - "python_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" else: - if get_option("future.infer_string"): - dtype = StringDtype(storage="python_numpy") + if using_string_dtype(): + dtype = StringDtype(storage="python", na_value=np.nan) else: dtype = StringDtype(storage="python") @@ -749,7 +756,8 @@ def _str_map( class StringArrayNumpySemantics(StringArray): - _storage = "python_numpy" + _storage = "python" + _na_value = np.nan def _validate(self) -> None: """Validate that we only store NaN or strings.""" @@ -769,7 +777,7 @@ def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False ) -> Self: if dtype is None: - dtype = StringDtype(storage="python_numpy") + dtype = StringDtype(storage="python", na_value=np.nan) return super()._from_sequence(scalars, dtype=dtype, copy=copy) def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index bde5aaa8ecf5b..62632d4cf17b7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -74,7 +74,7 @@ def test_repr(dtype): elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" - elif dtype.storage == "python_numpy": + elif dtype.storage == "python" and dtype.na_value is np.nan: arr_name = "StringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: @@ -92,14 +92,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": msg = "Must provide strings." 
else: msg = "Scalar must be NA or str" @@ -514,7 +514,7 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": expected = pc.cast(expected, pa.string()) assert arr.equals(expected) @@ -534,7 +534,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -564,7 +564,7 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -663,7 +663,7 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - if dtype.storage == "python_numpy": + if dtype.storage == "python" and dtype.na_value is np.nan: # TODO what do we want here? expected = pd.Series([True, False, False]) else: @@ -691,7 +691,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a2a17c9d3b938..fca8a0b39135b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -29,7 +29,7 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): if using_infer_string and string_storage in ("python_numpy", "pyarrow_numpy"): request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) - if string_storage == "pyarrow_numpy": + if string_storage in ("pyarrow_numpy", "python_numpy"): request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 64b383ded97b5..a747b9c30bb7f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -192,7 +192,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.na_value is np.nan # type: ignore[union-attr] + or (ser.dtype.storage == "pyarrow" and ser.dtype.na_value is np.nan) # type: ignore[union-attr] and op_name in ("any", "all") ) From bae8d65b5f0b97511b2320fa0d133905e8e0e1ba Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jul 2024 09:54:09 +0200 Subject: [PATCH 06/12] update tests to avoid string[python_numpy] --- pandas/tests/series/test_constructors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 113b3b9f4a93b..e66dc824b059c 100644 --- 
a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2077,7 +2077,7 @@ def test_series_string_inference(self): # GH#54430 with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2091,7 +2091,7 @@ def test_series_string_with_na_inference(self, na_value): # GH#54430 with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2099,7 +2099,7 @@ def test_series_string_inference_scalar(self): # GH#54430 with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2107,7 +2107,7 @@ def test_series_string_inference_array_string_dtype(self): # GH#54496 with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2134,7 +2134,7 @@ def test_series_string_inference_na_first(self): # GH#55655 with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) From 864c166751087aa20a1be8c93321583f12ea61f6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Aug 2024 21:12:10 +0200 Subject: [PATCH 07/12] remove all python_numpy usage --- pandas/conftest.py | 2 +- pandas/core/arrays/string_.py | 7 ------- pandas/tests/strings/test_find_replace.py | 18 +++++++----------- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index c6f0afd90d036..7c485515f0784 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1295,7 +1295,6 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ "python", - "python_numpy", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), ] ) @@ -1314,6 +1313,7 @@ def string_storage(request): ("python", pd.NA), pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), ] ) def string_dtype_arguments(request): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2040ebf992981..59621647ea6bd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -146,10 +146,6 @@ def __init__( # TODO raise a deprecation warning storage = "pyarrow" na_value = np.nan - if storage == "python_numpy": - # TODO remove - storage = "python" - na_value = np.nan # validate options if storage not in {"python", "pyarrow"}: @@ -234,9 +230,6 
@@ def construct_from_string(cls, string) -> Self: return cls() elif string == "string[python]": return cls(storage="python") - elif string == "string[python_numpy]": - # TODO remove - return cls(storage="python", na_value=np.nan) elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 4c977dd640ac1..00677ef4fcfe9 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -232,18 +232,14 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - if any_string_dtype == "string[python_numpy]": - with pytest.raises(TypeError): - result = s.str.contains("foo", na="foo") + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype.na_value is np.nan: + expected = Series([True, True, True], dtype=np.bool_) else: - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": - expected = Series([True, True, True], dtype=np.bool_) - else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = ( From d3ad7b02208a036635455f953da514591f63fdcd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Aug 2024 16:39:19 +0200 Subject: [PATCH 08/12] remove hardcoded storage --- pandas/core/dtypes/cast.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/io/_util.py | 4 ++-- pandas/io/pytables.py | 7 ++++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d750451a1ca84..162f6a4d30f3f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 08e1650a5de12..535397871588c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -302,7 +302,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/io/_util.py b/pandas/io/_util.py index a72a16269959d..f502f827faa4e 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -34,6 +34,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), - pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.string(): pd.StringDtype(na_value=np.nan), + pa.large_string(): pd.StringDtype(na_value=np.nan), }.get diff --git 
a/pandas/io/pytables.py b/pandas/io/pytables.py index 4b569fb7e39e2..618254fee9259 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -75,6 +75,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, concat, isna, @@ -3295,7 +3296,7 @@ def read( values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - result = result.astype("string[pyarrow_numpy]") + result = result.astype(StringDtype(na_value=np.nan)) return result def write(self, obj, **kwargs) -> None: @@ -3364,7 +3365,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) if len(dfs) > 0: @@ -4741,7 +4742,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: From 028dc2c6a37c2a5876646df66965cdb4434a651c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Aug 2024 16:52:42 +0200 Subject: [PATCH 09/12] implement any/all reductions --- pandas/core/arrays/string_.py | 16 +++++++++++++++- pandas/tests/extension/test_string.py | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 59621647ea6bd..c3724be2181b3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -43,7 +43,10 @@ pandas_dtype, ) -from pandas.core import ops +from pandas.core import ( + nanops, + ops, +) from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -780,6 +783,17 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: # the masked_reductions use pd.NA if result is libmissing.NA: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 9ee506f560a32..2ab248787a1cf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -193,7 +193,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or (ser.dtype.storage == "pyarrow" and ser.dtype.na_value is np.nan) # type: ignore[union-attr] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) From 7f4baf79bd7633825df230920ffb9ef7008ceeb2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 3 Aug 2024 12:42:48 +0200 Subject: [PATCH 10/12] fix typing --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c3724be2181b3..0a8988eeca480 100644 --- 
a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -419,7 +419,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" _storage = "python" - _na_value = libmissing.NA + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) From fe6fce693f51aed7c095d1e2ba78f34c581b809e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Aug 2024 14:36:02 +0200 Subject: [PATCH 11/12] Update pandas/core/arrays/string_.py Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0a8988eeca480..4fa33977b579d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -779,7 +779,7 @@ def _from_sequence( return super()._from_sequence(scalars, dtype=dtype, copy=copy) def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: - # need to overrde NumpyExtensionArray._from_backing_data to ensure + # need to override NumpyExtensionArray._from_backing_data to ensure # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) From 70325d4d9bf5003ea82d3929ff9fa718261e8b9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Aug 2024 14:45:47 +0200 Subject: [PATCH 12/12] update todo comment --- pandas/tests/arrays/string_/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f57a0f804dc02..3688d2998b3c7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -653,7 +653,7 @@ def test_isin(dtype, fixed_now_ts): result = s.isin(["a", pd.NA]) if dtype.storage == "python" and dtype.na_value is np.nan: - # TODO what do we want here? + # TODO(infer_string) we should make this consistent expected = pd.Series([True, False, False]) else: expected = pd.Series([True, False, True])
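
For readers trying out the series, here is a minimal, illustrative usage sketch (not itself part of any patch), assuming a pandas build with these commits applied. It exercises the object-dtype backed variant directly via the NaN-valued StringDtype spelling used in the test updates above; the printed values are approximate and depend on whether pyarrow is installed.

```python
import numpy as np
import pandas as pd

# Request the object-dtype backed string array with NumPy semantics by
# constructing the dtype with np.nan as its missing-value sentinel
# (the spelling used in the constructor tests in this series).
dtype = pd.StringDtype(storage="python", na_value=np.nan)

arr = pd.array(["a", None, "c"], dtype=dtype)
print(arr)               # <StringArrayNumpySemantics> ['a', nan, 'c'] ... dtype: string

# Missing entries are stored as np.nan rather than pd.NA.
print(np.isnan(arr[1]))  # True

# Comparisons return plain NumPy boolean arrays; missing positions become
# False (True for !=), mirroring the pyarrow-backed "numpy semantics" variant.
print(arr == "a")        # [ True False False]

# With the future option enabled, plain string data is inferred to a
# NaN-valued string dtype: pyarrow-backed when pyarrow is installed,
# otherwise this object-dtype variant.
with pd.option_context("future.infer_string", True):
    ser = pd.Series(["x", "y", None])
    print(ser.dtype)     # string
```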