String dtype: rename the storage options and add na_value keyword in StringDtype() #59330

Merged: 17 commits, Jul 29, 2024
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyx
@@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects,
if using_string_dtype() and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
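For illustration (not part of the diff): a minimal sketch of the dtype rename this hunk relies on, assuming a pandas build that includes this PR and pyarrow >= 10.0.1 installed. The old "pyarrow_numpy" spelling is still accepted but remapped to the new storage/na_value pair.

import numpy as np
import pandas as pd

# Both spellings should describe the same dtype after the remapping in StringDtype.__init__.
new = pd.StringDtype(storage="pyarrow", na_value=np.nan)
old = pd.StringDtype(storage="pyarrow_numpy")  # slated for deprecation
print(new == old)    # expected: True
print(new.na_value)  # expected: nan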
4 changes: 2 additions & 2 deletions pandas/_testing/__init__.py
@@ -509,14 +509,14 @@ def shares_memory(left, right) -> bool:
if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
right = cast("ArrowExtensionArray", right)
left_pa_data = left._pa_array
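Illustrative sketch (not part of the diff) of why the membership test can become a plain equality check, assuming this PR and pyarrow installed: both missing-value variants now report the same storage.

import numpy as np
import pandas as pd

na_arr = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow"))
nan_arr = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
# "pyarrow_numpy" no longer appears as a storage value, so == "pyarrow" covers both.
print(na_arr.dtype.storage, nan_arr.dtype.storage)  # expected: pyarrow pyarrow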
6 changes: 2 additions & 4 deletions pandas/core/arrays/arrow/array.py
@@ -575,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer):
if isinstance(item, np.ndarray):
if not len(item):
# Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
if self._dtype.name == "string" and self._dtype.storage in (
"pyarrow",
"pyarrow_numpy",
):
if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
# TODO(infer_string) should this be large_string?
Review comment (Member): Yeah I think this was overlooked when the large_string transition happened. Might be nice if this pyarrow type was an attribute on StringDtype?

pa_dtype = pa.string()
else:
pa_dtype = self._dtype.pyarrow_dtype
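A quick sketch (not part of the diff, assumes pyarrow installed) of the empty-indexer path this hunk touches: indexing with a zero-length integer array should keep the string dtype.

import numpy as np
import pandas as pd

arr = pd.array(["a", "b", "c"], dtype=pd.StringDtype("pyarrow"))
empty = arr[np.array([], dtype=np.intp)]  # hits the len(item) == 0 branch above
print(len(empty), empty.dtype == arr.dtype)  # expected: 0 True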
89 changes: 68 additions & 21 deletions pandas/core/arrays/string_.py
@@ -9,7 +9,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_string_dtype,
)

from pandas._libs import (
lib,
@@ -81,8 +84,10 @@ class StringDtype(StorageExtensionDtype):

Parameters
----------
storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
storage : {"python", "pyarrow"}, optional
If not given, the value of ``pd.options.mode.string_storage``.
na_value : {np.nan, pd.NA}, default pd.NA
Whether the dtype follows NaN or NA missing value semantics.

Attributes
----------
@@ -113,30 +118,67 @@ class StringDtype(StorageExtensionDtype):
# follows NumPy semantics, which uses nan.
@property
def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
if self.storage == "pyarrow_numpy":
return np.nan
else:
return libmissing.NA
return self._na_value

_metadata = ("storage",)
_metadata = ("storage", "_na_value") # type: ignore[assignment]

def __init__(self, storage=None) -> None:
def __init__(
self,
storage: str | None = None,
na_value: libmissing.NAType | float = libmissing.NA,
) -> None:
# infer defaults
if storage is None:
infer_string = get_option("future.infer_string")
if infer_string:
storage = "pyarrow_numpy"
if using_string_dtype():
storage = "pyarrow"
else:
storage = get_option("mode.string_storage")
if storage not in {"python", "pyarrow", "pyarrow_numpy"}:

if storage == "pyarrow_numpy":
# TODO raise a deprecation warning
storage = "pyarrow"
na_value = np.nan

# validate options
if storage not in {"python", "pyarrow"}:
raise ValueError(
f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
f"Got {storage} instead."
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
)
if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
if storage == "pyarrow" and pa_version_under10p1:
raise ImportError(
"pyarrow>=10.0.1 is required for PyArrow backed StringArray."
)

if isinstance(na_value, float) and np.isnan(na_value):
# when passed a NaN value, always set to np.nan to ensure we use
# a consistent NaN value (and we can use `dtype.na_value is np.nan`)
na_value = np.nan
elif na_value is not libmissing.NA:
raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")

self.storage = storage
self._na_value = na_value

def __eq__(self, other: object) -> bool:
# we need to override the base class __eq__ because na_value (NA or NaN)
# cannot be checked with normal `==`
if isinstance(other, str):
if other == self.name:
return True
try:
other = self.construct_from_string(other)
except TypeError:
return False
if isinstance(other, type(self)):
return self.storage == other.storage and self.na_value is other.na_value
return False

def __hash__(self) -> int:
# need to override __hash__ as well because of overriding __eq__
return super().__hash__()

def __reduce__(self):
return StringDtype, (self.storage, self.na_value)

@property
def type(self) -> type[str]:
Expand Down Expand Up @@ -181,6 +223,7 @@ def construct_from_string(cls, string) -> Self:
elif string == "string[pyarrow]":
return cls(storage="pyarrow")
elif string == "string[pyarrow_numpy]":
# TODO deprecate
return cls(storage="pyarrow_numpy")
else:
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
@@ -205,7 +248,7 @@ def construct_array_type( # type: ignore[override]

if self.storage == "python":
return StringArray
elif self.storage == "pyarrow":
elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
return ArrowStringArray
else:
return ArrowStringArrayNumpySemantics
@@ -217,13 +260,17 @@ def __from_arrow__(
Construct StringArray from pyarrow Array/ChunkedArray.
"""
if self.storage == "pyarrow":
from pandas.core.arrays.string_arrow import ArrowStringArray
if self._na_value is libmissing.NA:
from pandas.core.arrays.string_arrow import ArrowStringArray

return ArrowStringArray(array)
else:
from pandas.core.arrays.string_arrow import (
ArrowStringArrayNumpySemantics,
)

return ArrowStringArray(array)
elif self.storage == "pyarrow_numpy":
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
return ArrowStringArrayNumpySemantics(array)

return ArrowStringArrayNumpySemantics(array)
else:
import pyarrow

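To summarise the new constructor, equality, and array-type dispatch in one place, a sketch assuming this PR and pyarrow >= 10.0.1 (the printed values are the expected behaviour, not output captured from the PR):

import pickle

import numpy as np
import pandas as pd

na_dtype = pd.StringDtype("pyarrow")                    # na_value defaults to pd.NA
nan_dtype = pd.StringDtype("pyarrow", na_value=np.nan)  # NumPy-style missing values

print(na_dtype == "string[pyarrow]")     # expected: True (string alias still matches)
print(na_dtype == nan_dtype)             # expected: False (same storage, different na_value)
print(na_dtype.construct_array_type())   # expected: ArrowStringArray
print(nan_dtype.construct_array_type())  # expected: ArrowStringArrayNumpySemantics

# __reduce__ carries na_value, so pickling round-trips the NaN variant.
print(pickle.loads(pickle.dumps(nan_dtype)) == nan_dtype)  # expected: True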
11 changes: 5 additions & 6 deletions pandas/core/arrays/string_arrow.py
@@ -131,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
# base class "ArrowExtensionArray" defined the type as "ArrowDtype")
_dtype: StringDtype # type: ignore[assignment]
_storage = "pyarrow"
_na_value: libmissing.NAType | float = libmissing.NA

def __init__(self, values) -> None:
_chk_pyarrow_available()
@@ -140,7 +141,7 @@ def __init__(self, values) -> None:
values = pc.cast(values, pa.large_string())

super().__init__(values)
self._dtype = StringDtype(storage=self._storage)
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)

if not pa.types.is_large_string(self._pa_array.type) and not (
pa.types.is_dictionary(self._pa_array.type)
@@ -187,10 +188,7 @@ def _from_sequence(

if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage in (
"pyarrow",
"pyarrow_numpy",
)
assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"

if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype in ensure_string_array and
@@ -597,7 +595,8 @@ def _rank(


class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
_storage = "pyarrow"
_na_value = np.nan

@classmethod
def _result_converter(cls, values, na=None):
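Sketch (not part of the diff, assumes pyarrow installed): with both classes now reporting storage "pyarrow", only the dtype's na_value distinguishes the two array types.

import pyarrow as pa

from pandas.core.arrays.string_arrow import (
    ArrowStringArray,
    ArrowStringArrayNumpySemantics,
)

data = pa.array(["a", None, "c"], type=pa.large_string())
print(ArrowStringArray(data).dtype.na_value)                # expected: <NA>
print(ArrowStringArrayNumpySemantics(data).dtype.na_value)  # expected: nan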
4 changes: 2 additions & 2 deletions pandas/core/construction.py
@@ -574,7 +574,7 @@ def sanitize_array(
if isinstance(data, str) and using_string_dtype() and original_dtype is None:
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype("pyarrow_numpy")
dtype = StringDtype("pyarrow", na_value=np.nan)
data = construct_1d_arraylike_from_scalar(data, len(index), dtype)

return data
@@ -608,7 +608,7 @@ def sanitize_array(
elif data.dtype.kind == "U" and using_string_dtype():
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)

if subarr is data and copy:
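Sketch of the two sanitize_array paths changed here (assumes this PR, pyarrow installed, and the future string option): string scalars and numpy unicode arrays should both land on the NaN-semantics pyarrow dtype.

import numpy as np
import pandas as pd

with pd.option_context("future.infer_string", True):
    s = pd.Series("x", index=range(3))              # scalar broadcast path
    u = pd.Series(np.array(["a", "b"], dtype="U"))  # numpy unicode path
    print(s.dtype.storage, s.dtype.na_value)  # expected: pyarrow nan
    print(u.dtype == s.dtype)                 # expected: True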
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
@@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
if using_string_dtype():
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)

elif isinstance(val, (np.datetime64, dt.datetime)):
try:
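Sketch of the scalar-inference branch above (assumes this PR and pyarrow installed); infer_dtype_from_scalar is internal, so this only shows which dtype the branch now produces.

import pandas as pd
from pandas.core.dtypes.cast import infer_dtype_from_scalar

with pd.option_context("future.infer_string", True):
    dtype, _ = infer_dtype_from_scalar("hello")
    print(dtype.storage, dtype.na_value)  # expected: pyarrow nan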
3 changes: 2 additions & 1 deletion pandas/core/indexes/base.py
@@ -5453,9 +5453,10 @@ def equals(self, other: Any) -> bool:

if (
isinstance(self.dtype, StringDtype)
and self.dtype.storage == "pyarrow_numpy"
and self.dtype.na_value is np.nan
and other.dtype != self.dtype
):
# TODO(infer_string) can we avoid this special case?
# special case for object behavior
return other.equals(self.astype(object))

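Sketch of the special case kept here (assumes this PR and pyarrow installed): an index using the NaN-semantics string dtype still compares equal to an object index holding the same values.

import numpy as np
import pandas as pd

left = pd.Index(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
right = pd.Index(["a", "b"], dtype=object)
print(left.equals(right))  # expected: True, via the astype(object) fallback above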
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
@@ -302,7 +302,7 @@ def ndarray_to_mgr(
nb = new_block_2d(values, placement=bp, refs=refs)
block_values = [nb]
elif dtype is None and values.dtype.kind == "U" and using_string_dtype():
dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)

obj_columns = list(values)
block_values = [
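Sketch of the ndarray_to_mgr path (assumes this PR, pyarrow, and the future string option): a 2-D numpy unicode array handed to the DataFrame constructor should come out as NaN-semantics string columns.

import numpy as np
import pandas as pd

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame(np.array([["a", "b"], ["c", "d"]], dtype="U"))
    col = df.dtypes[0]
    print(col.storage, col.na_value)  # expected: pyarrow nan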
3 changes: 2 additions & 1 deletion pandas/core/reshape/encoding.py
@@ -10,6 +10,7 @@

import numpy as np

from pandas._libs import missing as libmissing
from pandas._libs.sparse import IntIndex

from pandas.core.dtypes.common import (
@@ -256,7 +257,7 @@ def _get_dummies_1d(
dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment]
elif (
isinstance(input_dtype, StringDtype)
and input_dtype.storage != "pyarrow_numpy"
and input_dtype.na_value is libmissing.NA
):
dtype = pandas_dtype("boolean") # type: ignore[assignment]
else:
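Sketch (assumes this PR and pyarrow installed): get_dummies now keys the result dtype off the string dtype's missing-value semantics instead of the old storage name.

import numpy as np
import pandas as pd

na_s = pd.Series(["a", "b"], dtype=pd.StringDtype("pyarrow"))
nan_s = pd.Series(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
print(pd.get_dummies(na_s).dtypes.iloc[0])   # expected: boolean (nullable)
print(pd.get_dummies(nan_s).dtypes.iloc[0])  # expected: bool (plain numpy)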
3 changes: 1 addition & 2 deletions pandas/core/reshape/merge.py
@@ -2677,8 +2677,7 @@ def _factorize_keys(

elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
isinstance(lk.dtype, StringDtype)
and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
):
import pyarrow as pa
import pyarrow.compute as pc
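Sketch (assumes this PR and pyarrow installed): both string-dtype variants now take the pyarrow factorization path when merging on string keys.

import numpy as np
import pandas as pd

dtype = pd.StringDtype("pyarrow", na_value=np.nan)
left = pd.DataFrame({"key": pd.array(["a", "b"], dtype=dtype), "x": [1, 2]})
right = pd.DataFrame({"key": pd.array(["b", "c"], dtype=dtype), "y": [3, 4]})
print(pd.merge(left, right, on="key"))  # expected: a single row for key "b"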
9 changes: 6 additions & 3 deletions pandas/core/tools/numeric.py
@@ -7,7 +7,10 @@

import numpy as np

from pandas._libs import lib
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.cast import maybe_downcast_numeric
@@ -218,7 +221,7 @@ def to_numeric(
coerce_numeric=coerce_numeric,
convert_to_masked_nullable=dtype_backend is not lib.no_default
or isinstance(values_dtype, StringDtype)
and not values_dtype.storage == "pyarrow_numpy",
and values_dtype.na_value is libmissing.NA,
)

if new_mask is not None:
@@ -229,7 +232,7 @@
dtype_backend is not lib.no_default
and new_mask is None
or isinstance(values_dtype, StringDtype)
and not values_dtype.storage == "pyarrow_numpy"
and values_dtype.na_value is libmissing.NA
):
new_mask = np.zeros(values.shape, dtype=np.bool_)

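Sketch (assumes this PR and pyarrow installed): to_numeric should keep returning plain numpy dtypes for the NaN-semantics variant and nullable dtypes for the NA variant, which is what the na_value check above preserves.

import numpy as np
import pandas as pd

na_s = pd.Series(["1", "2"], dtype=pd.StringDtype("pyarrow"))
nan_s = pd.Series(["1", "2"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
print(pd.to_numeric(na_s).dtype)   # expected: Int64 (nullable)
print(pd.to_numeric(nan_s).dtype)  # expected: int64 (numpy)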
6 changes: 4 additions & 2 deletions pandas/io/_util.py
@@ -2,6 +2,8 @@

from typing import TYPE_CHECKING

import numpy as np

from pandas.compat._optional import import_optional_dependency

import pandas as pd
@@ -32,6 +34,6 @@ def arrow_string_types_mapper() -> Callable:
pa = import_optional_dependency("pyarrow")

return {
pa.string(): pd.StringDtype(storage="pyarrow_numpy"),
pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"),
pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
}.get
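Sketch (assumes this PR and pyarrow installed): the mapper, which the Arrow-based readers use when the future string option is enabled, now hands pyarrow string types to the NaN-semantics StringDtype.

import pyarrow as pa

from pandas.io._util import arrow_string_types_mapper

mapper = arrow_string_types_mapper()
dtype = mapper(pa.large_string())
print(dtype.storage, dtype.na_value)  # expected: pyarrow nan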