From 2f1bc376892afa4b532ed1f70de98d3a72da4487 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 20 May 2024 17:32:40 +0200
Subject: [PATCH 01/14] rename storage option and add na_value keyword

---
 pandas/core/arrays/string_.py | 77 ++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 24 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 291cc2e62be62..3ad3b95909e3b 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -9,7 +9,10 @@
 
 import numpy as np
 
-from pandas._config import get_option
+from pandas._config import (
+    get_option,
+    using_pyarrow_string_dtype,
+)
 
 from pandas._libs import (
     lib,
@@ -83,6 +86,7 @@ class StringDtype(StorageExtensionDtype):
     ----------
     storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
         If not given, the value of ``pd.options.mode.string_storage``.
+    na_value :
 
     Attributes
     ----------
@@ -113,30 +117,49 @@ class StringDtype(StorageExtensionDtype):
     # follows NumPy semantics, which uses nan.
     @property
     def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
-        if self.storage == "pyarrow_numpy":
-            return np.nan
-        else:
-            return libmissing.NA
+        return self._na_value
 
     _metadata = ("storage",)
 
-    def __init__(self, storage=None) -> None:
-        if storage is None:
-            infer_string = get_option("future.infer_string")
-            if infer_string:
-                storage = "pyarrow_numpy"
+    def __init__(self, storage=None, na_value=None) -> None:
+        if not (
+            na_value is None or (isinstance(na_value, float) and np.isnan(na_value))
+        ):
+            raise ValueError(
+                "'na_value' must be the default value or pd.NA, got {na_value}"
+            )
+
+        # infer defaults
+        if storage is None and na_value is None:
+            if using_pyarrow_string_dtype():
+                storage = "pyarrow"
+                na_value = np.nan
             else:
                 storage = get_option("mode.string_storage")
-        if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
+                na_value = libmissing.NA
+        elif storage is None:
+            # in this case na_value is NaN
+            storage = get_option("mode.string_storage")
+        elif na_value is None:
+            na_value = np.nan if using_pyarrow_string_dtype() else libmissing.NA
+            if na_value is not libmissing.NA and storage == "python":
+                raise NotImplementedError(
+                    "'python' mode for na_value of NaN not yet implemented"
+                )
+
+        if storage == "pyarrow_numpy":
+            # TODO raise a deprecation warning
+            storage = "pyarrow"
+        if storage not in {"python", "pyarrow"}:
             raise ValueError(
-                f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
-                f"Got {storage} instead."
+                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
             )
-        if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
+        if storage == "pyarrow" and pa_version_under10p1:
             raise ImportError(
                 "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
             )
         self.storage = storage
+        self._na_value = na_value
 
     @property
     def type(self) -> type[str]:
@@ -176,11 +199,14 @@ def construct_from_string(cls, string) -> Self:
             )
         if string == "string":
             return cls()
+        elif string == "String":
+            return cls(na_value=np.nan)
         elif string == "string[python]":
-            return cls(storage="python")
+            return cls(storage="python", na_value=np.nan)
         elif string == "string[pyarrow]":
-            return cls(storage="pyarrow")
+            return cls(storage="pyarrow", na_value=np.nan)
         elif string == "string[pyarrow_numpy]":
+            # TODO deprecate
             return cls(storage="pyarrow_numpy")
         else:
             raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
@@ -205,10 +231,10 @@ def construct_array_type(  # type: ignore[override]
 
         if self.storage == "python":
             return StringArray
-        elif self.storage == "pyarrow":
-            return ArrowStringArray
-        else:
+        elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
             return ArrowStringArrayNumpySemantics
+        else:
+            return ArrowStringArray
 
     def __from_arrow__(
         self, array: pyarrow.Array | pyarrow.ChunkedArray
@@ -217,13 +243,16 @@ def __from_arrow__(
         Construct StringArray from pyarrow Array/ChunkedArray.
         """
         if self.storage == "pyarrow":
-            from pandas.core.arrays.string_arrow import ArrowStringArray
+            if self._na_value is libmissing.NA:
+                from pandas.core.arrays.string_arrow import (
+                    ArrowStringArrayNumpySemantics,
+                )
 
-            return ArrowStringArray(array)
-        elif self.storage == "pyarrow_numpy":
-            from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
+                return ArrowStringArrayNumpySemantics(array)
+            else:
+                from pandas.core.arrays.string_arrow import ArrowStringArray
 
-            return ArrowStringArrayNumpySemantics(array)
+                return ArrowStringArray(array)
         else:
             import pyarrow
 

From e29ca8de77aef7f4cb51deba1a3e365d819eeb09 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 26 Jul 2024 22:14:49 +0200
Subject: [PATCH 02/14] update init: default na_value to pd.NA and replace
 "pyarrow_numpy" storage usages

---
 pandas/_libs/lib.pyx                  |  2 +-
 pandas/_testing/__init__.py           |  4 +--
 pandas/core/arrays/arrow/array.py     |  6 ++--
 pandas/core/arrays/string_.py         | 46 ++++++++++-----------------
 pandas/core/arrays/string_arrow.py    |  2 +-
 pandas/core/construction.py           |  4 +--
 pandas/core/dtypes/cast.py            |  2 +-
 pandas/core/internals/construction.py |  2 +-
 pandas/core/reshape/encoding.py       |  3 +-
 pandas/core/reshape/merge.py          |  3 +-
 pandas/core/tools/numeric.py          |  9 ++++--
 pandas/io/_util.py                    |  6 ++--
 12 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 2650d60eb3cef..0bb47541e5963 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects,
         if using_string_dtype() and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype(storage="pyarrow_numpy")
+            dtype = StringDtype(storage="pyarrow", na_value=np.nan)
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index 1cd91ee5b120c..3aa53d4b07aa5 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -509,14 +509,14 @@ def shares_memory(left, right) -> bool:
     if (
         isinstance(left, ExtensionArray)
         and is_string_dtype(left.dtype)
-        and left.dtype.storage in ("pyarrow", "pyarrow_numpy")  # type: ignore[attr-defined]
+        and left.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
     ):
         # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
         left = cast("ArrowExtensionArray", left)
         if (
             isinstance(right, ExtensionArray)
             and is_string_dtype(right.dtype)
-            and right.dtype.storage in ("pyarrow", "pyarrow_numpy")  # type: ignore[attr-defined]
+            and right.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
         ):
             right = cast("ArrowExtensionArray", right)
             left_pa_data = left._pa_array
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 5da479760047f..a17056b51a014 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -575,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer):
         if isinstance(item, np.ndarray):
             if not len(item):
                 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
-                if self._dtype.name == "string" and self._dtype.storage in (
-                    "pyarrow",
-                    "pyarrow_numpy",
-                ):
+                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
+                    # TODO(infer_string) should this be large_string?
                     pa_dtype = pa.string()
                 else:
                     pa_dtype = self._dtype.pyarrow_dtype
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index edb6910b9fd5a..fed8943105fcf 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -11,7 +11,7 @@
 
 from pandas._config import (
     get_option,
-    using_pyarrow_string_dtype,
+    using_string_dtype,
 )
 
 from pandas._libs import (
@@ -84,7 +84,7 @@ class StringDtype(StorageExtensionDtype):
 
     Parameters
     ----------
-    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
+    storage : {"python", "pyarrow"}, optional
         If not given, the value of ``pd.options.mode.string_storage``.
     na_value :
 
@@ -121,35 +121,24 @@ def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
 
     _metadata = ("storage",)
 
-    def __init__(self, storage=None, na_value=None) -> None:
+    def __init__(self, storage=None, na_value=libmissing.NA) -> None:
         if not (
-            na_value is None or (isinstance(na_value, float) and np.isnan(na_value))
+            na_value is libmissing.NA
+            or (isinstance(na_value, float) and np.isnan(na_value))
         ):
-            raise ValueError(
-                "'na_value' must be the default value or pd.NA, got {na_value}"
-            )
+            raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")
 
         # infer defaults
-        if storage is None and na_value is None:
-            if using_pyarrow_string_dtype():
+        if storage is None:
+            if using_string_dtype():
                 storage = "pyarrow"
-                na_value = np.nan
             else:
                 storage = get_option("mode.string_storage")
-                na_value = libmissing.NA
-        elif storage is None:
-            # in this case na_value is NaN
-            storage = get_option("mode.string_storage")
-        elif na_value is None:
-            na_value = np.nan if using_pyarrow_string_dtype() else libmissing.NA
-            if na_value is not libmissing.NA and storage == "python":
-                raise NotImplementedError(
-                    "'python' mode for na_value of NaN not yet implemented"
-                )
 
         if storage == "pyarrow_numpy":
             # TODO raise a deprecation warning
             storage = "pyarrow"
+
         if storage not in {"python", "pyarrow"}:
             raise ValueError(
                 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
@@ -199,12 +188,10 @@ def construct_from_string(cls, string) -> Self:
             )
         if string == "string":
             return cls()
-        elif string == "String":
-            return cls(na_value=np.nan)
         elif string == "string[python]":
-            return cls(storage="python", na_value=np.nan)
+            return cls(storage="python")
         elif string == "string[pyarrow]":
-            return cls(storage="pyarrow", na_value=np.nan)
+            return cls(storage="pyarrow")
         elif string == "string[pyarrow_numpy]":
             # TODO deprecate
             return cls(storage="pyarrow_numpy")
@@ -232,9 +219,9 @@ def construct_array_type(  # type: ignore[override]
         if self.storage == "python":
             return StringArray
         elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
-            return ArrowStringArrayNumpySemantics
-        else:
             return ArrowStringArray
+        else:
+            return ArrowStringArrayNumpySemantics
 
     def __from_arrow__(
         self, array: pyarrow.Array | pyarrow.ChunkedArray
@@ -244,15 +231,16 @@ def __from_arrow__(
         """
         if self.storage == "pyarrow":
             if self._na_value is libmissing.NA:
+                from pandas.core.arrays.string_arrow import ArrowStringArray
+
+                return ArrowStringArray(array)
+            else:
                 from pandas.core.arrays.string_arrow import (
                     ArrowStringArrayNumpySemantics,
                 )
 
                 return ArrowStringArrayNumpySemantics(array)
-            else:
-                from pandas.core.arrays.string_arrow import ArrowStringArray
 
-                return ArrowStringArray(array)
         else:
             import pyarrow
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 97c06149d0b7e..d8c322fdbf27a 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -597,7 +597,7 @@ def _rank(
 
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
-    _storage = "pyarrow_numpy"
+    _storage = "pyarrow"
 
     @classmethod
     def _result_converter(cls, values, na=None):
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 32792aa7f0543..81aeb40f375b0 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -574,7 +574,7 @@ def sanitize_array(
         if isinstance(data, str) and using_string_dtype() and original_dtype is None:
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype("pyarrow_numpy")
+            dtype = StringDtype("pyarrow", na_value=np.nan)
         data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
 
         return data
@@ -608,7 +608,7 @@ def sanitize_array(
             elif data.dtype.kind == "U" and using_string_dtype():
                 from pandas.core.arrays.string_ import StringDtype
 
-                dtype = StringDtype(storage="pyarrow_numpy")
+                dtype = StringDtype(storage="pyarrow", na_value=np.nan)
                 subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
 
             if subarr is data and copy:
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 21e45505b40fc..d750451a1ca84 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
         if using_string_dtype():
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype(storage="pyarrow_numpy")
+            dtype = StringDtype(storage="pyarrow", na_value=np.nan)
 
     elif isinstance(val, (np.datetime64, dt.datetime)):
         try:
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index c31479b3011e5..08e1650a5de12 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -302,7 +302,7 @@ def ndarray_to_mgr(
             nb = new_block_2d(values, placement=bp, refs=refs)
             block_values = [nb]
     elif dtype is None and values.dtype.kind == "U" and using_string_dtype():
-        dtype = StringDtype(storage="pyarrow_numpy")
+        dtype = StringDtype(storage="pyarrow", na_value=np.nan)
 
         obj_columns = list(values)
         block_values = [
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
index 9d88e61951e99..c397c1c2566a5 100644
--- a/pandas/core/reshape/encoding.py
+++ b/pandas/core/reshape/encoding.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 
+from pandas._libs import missing as libmissing
 from pandas._libs.sparse import IntIndex
 
 from pandas.core.dtypes.common import (
@@ -256,7 +257,7 @@ def _get_dummies_1d(
             dtype = ArrowDtype(pa.bool_())  # type: ignore[assignment]
         elif (
             isinstance(input_dtype, StringDtype)
-            and input_dtype.storage != "pyarrow_numpy"
+            and input_dtype.na_value is libmissing.NA
         ):
             dtype = pandas_dtype("boolean")  # type: ignore[assignment]
         else:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 2ce77ac19b9c5..6364072fd215c 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2677,8 +2677,7 @@ def _factorize_keys(
 
     elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
         if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
-            isinstance(lk.dtype, StringDtype)
-            and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
+            isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
         ):
             import pyarrow as pa
             import pyarrow.compute as pc
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 3d406d3bfb115..26e73794af298 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -7,7 +7,10 @@
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import (
+    lib,
+    missing as libmissing,
+)
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.cast import maybe_downcast_numeric
@@ -218,7 +221,7 @@ def to_numeric(
             coerce_numeric=coerce_numeric,
             convert_to_masked_nullable=dtype_backend is not lib.no_default
             or isinstance(values_dtype, StringDtype)
-            and not values_dtype.storage == "pyarrow_numpy",
+            and values_dtype.na_value is libmissing.NA,
         )
 
     if new_mask is not None:
@@ -229,7 +232,7 @@ def to_numeric(
         dtype_backend is not lib.no_default
         and new_mask is None
         or isinstance(values_dtype, StringDtype)
-        and not values_dtype.storage == "pyarrow_numpy"
+        and values_dtype.na_value is libmissing.NA
     ):
         new_mask = np.zeros(values.shape, dtype=np.bool_)
 
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
index cb0f89945e440..a72a16269959d 100644
--- a/pandas/io/_util.py
+++ b/pandas/io/_util.py
@@ -2,6 +2,8 @@
 
 from typing import TYPE_CHECKING
 
+import numpy as np
+
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -32,6 +34,6 @@ def arrow_string_types_mapper() -> Callable:
     pa = import_optional_dependency("pyarrow")
 
     return {
-        pa.string(): pd.StringDtype(storage="pyarrow_numpy"),
-        pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"),
+        pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
+        pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
     }.get

From cb7410f461ff3b99d3dbab339ff50fb6255e5308 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 27 Jul 2024 13:08:22 +0200
Subject: [PATCH 03/14] fix propagating na_value to Array class + fix some
 tests

---
 pandas/core/arrays/string_.py                 |  1 +
 pandas/core/arrays/string_arrow.py            |  4 +-
 pandas/tests/arrays/string_/test_string.py    | 53 ++++++++-----------
 .../tests/arrays/string_/test_string_arrow.py |  2 +-
 pandas/tests/extension/base/methods.py        |  8 +--
 pandas/tests/extension/test_string.py         |  6 +--
 6 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index fed8943105fcf..55977c2a2e93e 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -138,6 +138,7 @@ def __init__(self, storage=None, na_value=libmissing.NA) -> None:
         if storage == "pyarrow_numpy":
             # TODO raise a deprecation warning
             storage = "pyarrow"
+            na_value = np.nan
 
         if storage not in {"python", "pyarrow"}:
             raise ValueError(
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index d8c322fdbf27a..894dd1b0afc18 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -131,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
     _dtype: StringDtype  # type: ignore[assignment]
     _storage = "pyarrow"
+    _na_value = libmissing.NA
 
     def __init__(self, values) -> None:
         _chk_pyarrow_available()
@@ -140,7 +141,7 @@ def __init__(self, values) -> None:
             values = pc.cast(values, pa.large_string())
 
         super().__init__(values)
-        self._dtype = StringDtype(storage=self._storage)
+        self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
 
         if not pa.types.is_large_string(self._pa_array.type) and not (
             pa.types.is_dictionary(self._pa_array.type)
@@ -598,6 +599,7 @@ def _rank(
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
     _storage = "pyarrow"
+    _na_value = np.nan
 
     @classmethod
     def _result_converter(cls, values, na=None):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 597b407a29c94..ee648783c47be 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -20,13 +20,6 @@
 )
 
 
-def na_val(dtype):
-    if dtype.storage == "pyarrow_numpy":
-        return np.nan
-    else:
-        return pd.NA
-
-
 @pytest.fixture
 def dtype(string_storage):
     """Fixture giving StringDtype from parametrized 'string_storage'"""
@@ -41,22 +34,22 @@ def cls(dtype):
 
 def test_repr(dtype):
     df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected = "     A\n0    a\n1  NaN\n2    b"
     else:
         expected = "      A\n0     a\n1  <NA>\n2     b"
     assert repr(df) == expected
 
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected = "0      a\n1    NaN\n2      b\nName: A, dtype: string"
     else:
         expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
     assert repr(df.A) == expected
 
-    if dtype.storage == "pyarrow":
+    if dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
         arr_name = "ArrowStringArray"
         expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
-    elif dtype.storage == "pyarrow_numpy":
+    elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
         arr_name = "ArrowStringArrayNumpySemantics"
         expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
     else:
@@ -68,7 +61,7 @@ def test_repr(dtype):
 def test_none_to_nan(cls, dtype):
     a = cls._from_sequence(["a", None, "b"], dtype=dtype)
     assert a[1] is not None
-    assert a[1] is na_val(a.dtype)
+    assert a[1] is a.dtype.na_value
 
 
 def test_setitem_validates(cls, dtype):
@@ -225,7 +218,7 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     other = "a"
     result = getattr(a, op_name)(other)
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected = np.array([getattr(item, op_name)(other) for item in a])
         if comparison_op == operator.ne:
             expected[1] = True
@@ -244,7 +237,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     result = getattr(a, op_name)(pd.NA)
 
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         if operator.ne == comparison_op:
             expected = np.array([True, True, True])
         else:
@@ -271,7 +264,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
 
     result = getattr(a, op_name)(other)
 
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected_data = {
             "__eq__": [False, False, False],
             "__ne__": [True, True, True],
@@ -293,7 +286,7 @@ def test_comparison_methods_array(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         if operator.ne == comparison_op:
             expected = np.array([True, True, False])
         else:
@@ -387,7 +380,7 @@ def test_astype_int(dtype):
     tm.assert_numpy_array_equal(result, expected)
 
     arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         err = ValueError
         msg = "cannot convert float NaN to integer"
     else:
@@ -441,7 +434,7 @@ def test_min_max(method, skipna, dtype):
         expected = "a" if method == "min" else "c"
         assert result == expected
     else:
-        assert result is na_val(arr.dtype)
+        assert result is arr.dtype.na_value
 
 
 @pytest.mark.parametrize("method", ["min", "max"])
@@ -522,7 +515,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
     expected = df.astype(f"string[{string_storage2}]")
     tm.assert_frame_equal(result, expected)
     # ensure the missing value is represented by NA and not np.nan or None
-    assert result.loc[2, "a"] is na_val(result["a"].dtype)
+    assert result.loc[2, "a"] is result["a"].dtype.na_value
 
 
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
@@ -556,10 +549,10 @@ def test_arrow_load_from_zero_chunks(
 
 
 def test_value_counts_na(dtype):
-    if getattr(dtype, "storage", "") == "pyarrow":
-        exp_dtype = "int64[pyarrow]"
-    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         exp_dtype = "int64"
+    elif dtype.storage == "pyarrow":
+        exp_dtype = "int64[pyarrow]"
     else:
         exp_dtype = "Int64"
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
@@ -573,10 +566,10 @@ def test_value_counts_na(dtype):
 
 
 def test_value_counts_with_normalize(dtype):
-    if getattr(dtype, "storage", "") == "pyarrow":
-        exp_dtype = "double[pyarrow]"
-    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         exp_dtype = np.float64
+    elif dtype.storage == "pyarrow":
+        exp_dtype = "double[pyarrow]"
     else:
         exp_dtype = "Float64"
     ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
@@ -586,10 +579,10 @@ def test_value_counts_with_normalize(dtype):
 
 
 def test_value_counts_sort_false(dtype):
-    if getattr(dtype, "storage", "") == "pyarrow":
-        exp_dtype = "int64[pyarrow]"
-    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         exp_dtype = "int64"
+    elif dtype.storage == "pyarrow":
+        exp_dtype = "int64[pyarrow]"
     else:
         exp_dtype = "Int64"
     ser = pd.Series(["a", "b", "c", "b"], dtype=dtype)
@@ -621,7 +614,7 @@ def test_astype_from_float_dtype(float_dtype, dtype):
 def test_to_numpy_returns_pdna_default(dtype):
     arr = pd.array(["a", pd.NA, "b"], dtype=dtype)
     result = np.array(arr)
-    expected = np.array(["a", na_val(dtype), "b"], dtype=object)
+    expected = np.array(["a", dtype.na_value, "b"], dtype=object)
     tm.assert_numpy_array_equal(result, expected)
 
 
@@ -661,7 +654,7 @@ def test_setitem_scalar_with_mask_validation(dtype):
     mask = np.array([False, True, False])
 
     ser[mask] = None
-    assert ser.array[1] is na_val(ser.dtype)
+    assert ser.array[1] is ser.dtype.na_value
 
     # for other non-string we should also raise an error
     ser = pd.Series(["a", "b", "c"], dtype=dtype)
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 405c1c217b04d..458a3f5ae77d4 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -260,6 +260,6 @@ def test_pickle_roundtrip(dtype):
 def test_string_dtype_error_message():
     # GH#55051
     pytest.importorskip("pyarrow")
-    msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'."
+    msg = "Storage must be 'python' or 'pyarrow'."
     with pytest.raises(ValueError, match=msg):
         StringDtype("bla")
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index b7f0f973e640a..dd2ed0bd62a02 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data):
             expected = pd.Series(0.0, index=result.index, name="proportion")
             expected[result > 0] = 1 / len(values)
 
-        if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
+        if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan:
+            # TODO: avoid special-casing
+            expected = expected.astype("float64")
+        elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
             data.dtype, pd.ArrowDtype
         ):
             # TODO: avoid special-casing
             expected = expected.astype("double[pyarrow]")
-        elif getattr(data.dtype, "storage", "") == "pyarrow_numpy":
-            # TODO: avoid special-casing
-            expected = expected.astype("float64")
         elif na_value_for_dtype(data.dtype) is pd.NA:
             # TODO(GH#44692): avoid special-casing
             expected = expected.astype("Float64")
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 49ad3fce92a5c..a44bbca07039b 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -190,7 +190,7 @@ def _get_expected_exception(
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         return (
             op_name in ["min", "max"]
-            or ser.dtype.storage == "pyarrow_numpy"  # type: ignore[union-attr]
+            or ser.dtype.na_value is np.nan  # type: ignore[union-attr]
             and op_name in ("any", "all")
         )
 
@@ -198,10 +198,10 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = cast(StringDtype, tm.get_dtype(obj))
         if op_name in ["__add__", "__radd__"]:
             cast_to = dtype
+        elif dtype.na_value is np.nan:
+            cast_to = np.bool_  # type: ignore[assignment]
         elif dtype.storage == "pyarrow":
             cast_to = "boolean[pyarrow]"  # type: ignore[assignment]
-        elif dtype.storage == "pyarrow_numpy":
-            cast_to = np.bool_  # type: ignore[assignment]
         else:
             cast_to = "boolean"  # type: ignore[assignment]
         return pointwise_result.astype(cast_to)

From ffa7eaddd3934413938708e3d578655225af8948 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 27 Jul 2024 15:32:53 +0200
Subject: [PATCH 04/14] fix more tests: check na_value instead of
 "pyarrow_numpy" storage in Index.equals and strings test helper

---
 pandas/core/indexes/base.py      | 3 ++-
 pandas/tests/strings/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index e67c59c86dd0c..50f44cc728aea 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5453,9 +5453,10 @@ def equals(self, other: Any) -> bool:
 
         if (
             isinstance(self.dtype, StringDtype)
-            and self.dtype.storage == "pyarrow_numpy"
+            and self.dtype.na_value is np.nan
             and other.dtype != self.dtype
         ):
+            # TODO(infer_string) can we avoid this special case?
             # special case for object behavior
             return other.equals(self.astype(object))
 
diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py
index 01b49b5e5b633..e94f656fc9823 100644
--- a/pandas/tests/strings/__init__.py
+++ b/pandas/tests/strings/__init__.py
@@ -7,7 +7,7 @@
 
 def _convert_na_value(ser, expected):
     if ser.dtype != object:
-        if ser.dtype.storage == "pyarrow_numpy":
+        if ser.dtype.na_value is np.nan:
             expected = expected.fillna(np.nan)
         else:
             # GH#18463

From a9c466bf83e5c1339bc0da68640df7da1a7141b4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 27 Jul 2024 15:45:57 +0200
Subject: [PATCH 05/14] disallow pyarrow_numpy as option + fix more cases of
 checking storage to be pyarrow_numpy

---
 pandas/core/arrays/string_arrow.py            |  5 +---
 pandas/core/config_init.py                    |  2 +-
 pandas/tests/arrays/string_/test_string.py    |  8 ++++-
 .../tests/arrays/string_/test_string_arrow.py |  2 ++
 pandas/tests/extension/test_string.py         | 30 +++++++------------
 .../frame/methods/test_convert_dtypes.py      |  6 +++-
 6 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 894dd1b0afc18..e7acb7ca8af7a 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -188,10 +188,7 @@ def _from_sequence(
 
         if dtype and not (isinstance(dtype, str) and dtype == "string"):
             dtype = pandas_dtype(dtype)
-            assert isinstance(dtype, StringDtype) and dtype.storage in (
-                "pyarrow",
-                "pyarrow_numpy",
-            )
+            assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
 
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype in ensure_string_array and
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 352020f45388f..073d1de11e0db 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -460,7 +460,7 @@ def is_terminal() -> bool:
         "string_storage",
         "python",
         string_storage_doc,
-        validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
+        validator=is_one_of_factory(["python", "pyarrow"]),
     )
 
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index ee648783c47be..a7f45c185481c 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -483,7 +483,7 @@ def test_arrow_array(dtype):
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
-    if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
+    if dtype.storage == "pyarrow" and pa_version_under12p0:
         expected = pa.chunked_array(expected)
     if dtype.storage == "python":
         expected = pc.cast(expected, pa.string())
@@ -501,6 +501,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
                 reason="infer_string takes precedence over string storage"
             )
         )
+    if string_storage2 == "pyarrow_numpy":
+        # we cannot set "pyarrow_numpy" as storage option anymore, need to
+        # update the tests for this
+        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
 
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
@@ -531,6 +535,8 @@ def test_arrow_load_from_zero_chunks(
                 reason="infer_string takes precedence over string storage"
             )
         )
+    if string_storage2 == "pyarrow_numpy":
+        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
 
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 458a3f5ae77d4..c610ef5315723 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -29,6 +29,8 @@ def test_eq_all_na():
 def test_config(string_storage, request, using_infer_string):
     if using_infer_string and string_storage != "pyarrow_numpy":
         request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
+    if string_storage == "pyarrow_numpy":
+        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
     with pd.option_context("string_storage", string_storage):
         assert StringDtype().storage == string_storage
         result = pd.array(["a", "b"])
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index a44bbca07039b..b4858f02ad411 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -140,28 +140,21 @@ def _get_expected_exception(
         self, op_name: str, obj, other
     ) -> type[Exception] | None:
         if op_name in ["__divmod__", "__rdivmod__"]:
-            if isinstance(obj, pd.Series) and cast(
-                StringDtype, tm.get_dtype(obj)
-            ).storage in [
-                "pyarrow",
-                "pyarrow_numpy",
-            ]:
+            if (
+                isinstance(obj, pd.Series)
+                and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow"
+            ):
                 # TODO: re-raise as TypeError?
                 return NotImplementedError
-            elif isinstance(other, pd.Series) and cast(
-                StringDtype, tm.get_dtype(other)
-            ).storage in [
-                "pyarrow",
-                "pyarrow_numpy",
-            ]:
+            elif (
+                isinstance(other, pd.Series)
+                and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow"
+            ):
                 # TODO: re-raise as TypeError?
                 return NotImplementedError
             return TypeError
         elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]:
-            if cast(StringDtype, tm.get_dtype(obj)).storage in [
-                "pyarrow",
-                "pyarrow_numpy",
-            ]:
+            if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow":
                 return NotImplementedError
             return TypeError
         elif op_name in ["__mul__", "__rmul__"]:
@@ -175,10 +168,7 @@ def _get_expected_exception(
             "__sub__",
             "__rsub__",
         ]:
-            if cast(StringDtype, tm.get_dtype(obj)).storage in [
-                "pyarrow",
-                "pyarrow_numpy",
-            ]:
+            if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow":
                 import pyarrow as pa
 
                 # TODO: better to re-raise as TypeError?
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index 521d2cb14ac6a..4d2d3747a5298 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -18,13 +18,17 @@ def test_convert_dtypes(
         # Just check that it works for DataFrame here
         if using_infer_string:
             string_storage = "pyarrow_numpy"
+
+        string_storage_option = string_storage
+        if string_storage == "pyarrow_numpy":
+            string_storage_option = "pyarrow"
         df = pd.DataFrame(
             {
                 "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                 "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
             }
         )
-        with pd.option_context("string_storage", string_storage):
+        with pd.option_context("string_storage", string_storage_option):
             result = df.convert_dtypes(True, True, convert_integer, False)
         expected = pd.DataFrame(
             {

From 1fc2113e9627b96a447c4cc6265a65dc8acf221b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 27 Jul 2024 16:09:23 +0200
Subject: [PATCH 06/14] restore pyarrow_numpy as option for now

---
 pandas/core/config_init.py                 | 2 +-
 pandas/tests/arrays/string_/test_string.py | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 073d1de11e0db..352020f45388f 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -460,7 +460,7 @@ def is_terminal() -> bool:
         "string_storage",
         "python",
         string_storage_doc,
-        validator=is_one_of_factory(["python", "pyarrow"]),
+        validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
     )
 
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index a7f45c185481c..78adc10631a79 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -501,10 +501,6 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
                 reason="infer_string takes precedence over string storage"
             )
         )
-    if string_storage2 == "pyarrow_numpy":
-        # we cannot set "pyarrow_numpy" as storage option anymore, need to
-        # update the tests for this
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
 
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
@@ -535,8 +531,6 @@ def test_arrow_load_from_zero_chunks(
                 reason="infer_string takes precedence over string storage"
             )
         )
-    if string_storage2 == "pyarrow_numpy":
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
 
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})

From b347b942e35cda66bd4e751504533bed0b1ab591 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 27 Jul 2024 16:33:09 +0200
Subject: [PATCH 07/14] linting

---
 pandas/core/arrays/string_.py      | 3 ++-
 pandas/core/arrays/string_arrow.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 55977c2a2e93e..f4d9c8a6efde4 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -86,7 +86,8 @@ class StringDtype(StorageExtensionDtype):
     ----------
     storage : {"python", "pyarrow"}, optional
         If not given, the value of ``pd.options.mode.string_storage``.
-    na_value :
+    na_value : {np.nan, pd.NA}, default pd.NA
+        Whether the dtype follows NaN or NA missing value semantics.
 
     Attributes
     ----------
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index e7acb7ca8af7a..869cc34d5f61d 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -131,7 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
     _dtype: StringDtype  # type: ignore[assignment]
     _storage = "pyarrow"
-    _na_value = libmissing.NA
+    _na_value: libmissing.NAType | float = libmissing.NA
 
     def __init__(self, values) -> None:
         _chk_pyarrow_available()

From 80489fe1d01aa3b40447ae1c1e263653645eaa3b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 29 Jul 2024 15:36:29 +0200
Subject: [PATCH 08/14] try fix typing

---
 pandas/core/arrays/string_.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index f4d9c8a6efde4..0a731759243f9 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -122,7 +122,9 @@ def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
 
     _metadata = ("storage",)
 
-    def __init__(self, storage=None, na_value=libmissing.NA) -> None:
+    def __init__(
+        self, storage=None, na_value: libmissing.NAType | float = libmissing.NA
+    ) -> None:
         if not (
             na_value is libmissing.NA
             or (isinstance(na_value, float) and np.isnan(na_value))

From 8587297eb4955d0f3a97bdb8b34d920014e8493a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 29 Jul 2024 15:37:03 +0200
Subject: [PATCH 09/14] try fix typing

---
 pandas/core/arrays/string_.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 0a731759243f9..29cd0308831c8 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -123,7 +123,9 @@ def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
     _metadata = ("storage",)
 
     def __init__(
-        self, storage=None, na_value: libmissing.NAType | float = libmissing.NA
+        self,
+        storage: str | None = None,
+        na_value: libmissing.NAType | float = libmissing.NA,
     ) -> None:
         if not (
             na_value is libmissing.NA

From a9650bb3ba4075f878e2dcb93aa552f91a224a07 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 29 Jul 2024 17:35:35 +0200
Subject: [PATCH 10/14] fix dtype equality to take into account the NaN vs NA

---
 pandas/core/arrays/string_.py              | 35 +++++++++++++++++-----
 pandas/tests/arrays/string_/test_string.py | 21 +++++++++++++
 pandas/tests/extension/test_string.py      |  8 ++++-
 3 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 29cd0308831c8..a14f1dd840668 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -120,19 +120,13 @@ class StringDtype(StorageExtensionDtype):
     def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
         return self._na_value
 
-    _metadata = ("storage",)
+    _metadata = ("storage", "na_value")
 
     def __init__(
         self,
         storage: str | None = None,
         na_value: libmissing.NAType | float = libmissing.NA,
     ) -> None:
-        if not (
-            na_value is libmissing.NA
-            or (isinstance(na_value, float) and np.isnan(na_value))
-        ):
-            raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}")
-
         # infer defaults
         if storage is None:
             if using_string_dtype():
@@ -145,6 +139,7 @@ def __init__(
             storage = "pyarrow"
             na_value = np.nan
 
+        # validate options
         if storage not in {"python", "pyarrow"}:
             raise ValueError(
                 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
@@ -153,9 +148,35 @@ def __init__(
             raise ImportError(
                 "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
             )
+
+        if isinstance(na_value, float) and np.isnan(na_value):
+            # when passed a NaN value, always set to np.nan to ensure we use
+            # a consistent NaN value (and we can use `dtype.na_value is np.nan`)
+            na_value = np.nan
+        elif na_value is not libmissing.NA:
+            raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")
+
         self.storage = storage
         self._na_value = na_value
 
+    def __eq__(self, other: object) -> bool:
+        # Override the base class __eq__: na_value (pd.NA or np.nan) cannot be
+        # compared with `==`, so it is checked by identity (`is`) further down.
+        if isinstance(other, str):
+            if other == self.name:
+                return True
+            try:
+                other = self.construct_from_string(other)
+            except TypeError:
+                return False
+        if isinstance(other, type(self)):
+            return self.storage == other.storage and self.na_value is other.na_value
+        return False
+
+    def __hash__(self) -> int:
+        # need to override __hash__ as well because of overriding __eq__
+        return super().__hash__()
+
     @property
     def type(self) -> type[str]:
         return str
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 78adc10631a79..7757847f3c841 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -32,6 +32,27 @@ def cls(dtype):
     return dtype.construct_array_type()
 
 
+def test_dtype_equality():
+    pytest.importorskip("pyarrow")
+
+    dtype1 = pd.StringDtype("python")
+    dtype2 = pd.StringDtype("pyarrow")
+    dtype3 = pd.StringDtype("pyarrow", na_value=np.nan)
+
+    assert dtype1 == pd.StringDtype("python", na_value=pd.NA)
+    assert dtype1 != dtype2
+    assert dtype1 != dtype3
+
+    assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA)
+    assert dtype2 != dtype1
+    assert dtype2 != dtype3
+
+    assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan)
+    assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan"))
+    assert dtype3 != dtype1
+    assert dtype3 != dtype2
+
+
 def test_repr(dtype):
     df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})
     if dtype.na_value is np.nan:
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index b4858f02ad411..4628c5568b49b 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -96,9 +96,15 @@ def data_for_grouping(dtype, chunked):
 
 class TestStringArray(base.ExtensionTests):
     def test_eq_with_str(self, dtype):
-        assert dtype == f"string[{dtype.storage}]"
         super().test_eq_with_str(dtype)
 
+        if dtype.na_value is pd.NA:
+            # only the NA-variant supports parametrized string alias
+            assert dtype == f"string[{dtype.storage}]"
+        elif dtype.storage == "pyarrow":
+            # TODO(infer_string) deprecate this
+            assert dtype == "string[pyarrow_numpy]"
+
     def test_is_not_string_type(self, dtype):
         # Different from BaseDtypeTests.test_is_not_string_type
         # because StringDtype is a string type

From 4136c9eca61dd6f93a47ffd1c460b5a6c24ed18a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 29 Jul 2024 18:13:01 +0200
Subject: [PATCH 11/14] fix pickling of dtype

---
 pandas/core/arrays/string_.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index a14f1dd840668..d9a6036aa4da9 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -120,7 +120,7 @@ class StringDtype(StorageExtensionDtype):
     def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
         return self._na_value
 
-    _metadata = ("storage", "na_value")
+    _metadata = ("storage", "_na_value")
 
     def __init__(
         self,
@@ -177,6 +177,9 @@ def __hash__(self) -> int:
         # need to override __hash__ as well because of overriding __eq__
         return super().__hash__()
 
+    def __reduce__(self):
+        return StringDtype, (self.storage, self.na_value)
+
     @property
     def type(self) -> type[str]:
         return str

From c33e14af9fc7228249adc10fe3dcd15f1fd8a51a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 29 Jul 2024 18:15:14 +0200
Subject: [PATCH 12/14] fix test_convert_dtypes

---
 pandas/tests/frame/methods/test_convert_dtypes.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index 4d2d3747a5298..9cbbebf35b2d1 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -19,16 +19,13 @@ def test_convert_dtypes(
         if using_infer_string:
             string_storage = "pyarrow_numpy"
 
-        string_storage_option = string_storage
-        if string_storage == "pyarrow_numpy":
-            string_storage_option = "pyarrow"
         df = pd.DataFrame(
             {
                 "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                 "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
             }
         )
-        with pd.option_context("string_storage", string_storage_option):
+        with pd.option_context("string_storage", string_storage):
             result = df.convert_dtypes(True, True, convert_integer, False)
         expected = pd.DataFrame(
             {

From 151e3d110964a62652cfb6a0c0f9872ddb9c7ab4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 29 Jul 2024 18:19:35 +0200
Subject: [PATCH 13/14] update expected result for dtype='string'

---
 pandas/tests/series/test_constructors.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 44a7862c21273..91cf1708ed43b 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -2113,9 +2113,12 @@ def test_series_string_inference_array_string_dtype(self):
         tm.assert_series_equal(ser, expected)
 
     def test_series_string_inference_storage_definition(self):
-        # GH#54793
+        # https://github.com/pandas-dev/pandas/issues/54793
+        # but after PDEP-14 (string dtype), it was decided to keep dtype="string"
+        # returning the NA string dtype, so expected is changed from
+        # "string[pyarrow_numpy]" to "string[pyarrow]"
         pytest.importorskip("pyarrow")
-        expected = Series(["a", "b"], dtype="string[pyarrow_numpy]")
+        expected = Series(["a", "b"], dtype="string[pyarrow]")
         with pd.option_context("future.infer_string", True):
             result = Series(["a", "b"], dtype="string")
         tm.assert_series_equal(result, expected)

From 899e3fc96912f82fbc65956d512f7146a12bfefa Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 29 Jul 2024 20:10:17 +0200
Subject: [PATCH 14/14] suppress typing error with _metadata attribute

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index d9a6036aa4da9..cae770d85637c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -120,7 +120,7 @@ class StringDtype(StorageExtensionDtype):
     def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
         return self._na_value
 
-    _metadata = ("storage", "_na_value")
+    _metadata = ("storage", "_na_value")  # type: ignore[assignment]
 
     def __init__(
         self,