String dtype: implement object-dtype based StringArray variant with NumPy semantics #58451

Merged
Changes from 5 commits
18 commits
63a7fc5
String dtype: implement object-dtype based StringArray variant with N…
jorisvandenbossche Apr 27, 2024
0eee625
fix constructor to not convert to NA
jorisvandenbossche Apr 27, 2024
607b95e
fix typing
jorisvandenbossche Apr 27, 2024
bca157d
improve logic in str_map
jorisvandenbossche Apr 27, 2024
79eb3b4
Merge remote-tracking branch 'upstream/main' into string-dtype-object
jorisvandenbossche Jul 26, 2024
c063298
Merge remote-tracking branch 'upstream/main' into string-dtype-object
jorisvandenbossche Jul 30, 2024
ab96aa4
remove most usage of python_numpy
jorisvandenbossche Jul 30, 2024
bae8d65
update tests to avoid string[python_numpy]
jorisvandenbossche Jul 30, 2024
31f1c33
Merge remote-tracking branch 'upstream/main' into string-dtype-object
jorisvandenbossche Jul 31, 2024
cbd0820
Merge remote-tracking branch 'upstream/main' into string-dtype-object
jorisvandenbossche Aug 1, 2024
864c166
remove all python_numpy usage
jorisvandenbossche Aug 1, 2024
d3ad7b0
remove hardcoded storage
jorisvandenbossche Aug 2, 2024
028dc2c
implement any/all reductions
jorisvandenbossche Aug 2, 2024
1750bcb
Merge remote-tracking branch 'upstream/main' into string-dtype-object
jorisvandenbossche Aug 3, 2024
7f4baf7
fix typing
jorisvandenbossche Aug 3, 2024
fdf1454
Merge remote-tracking branch 'upstream/main' into string-dtype-object
jorisvandenbossche Aug 7, 2024
fe6fce6
Update pandas/core/arrays/string_.py
jorisvandenbossche Aug 7, 2024
70325d4
update todo comment
jorisvandenbossche Aug 7, 2024
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyx
@@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects,
if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
10 changes: 10 additions & 0 deletions pandas/_testing/asserters.py
@@ -790,6 +790,16 @@ def assert_extension_array_equal(
left_na, right_na, obj=f"{obj} NA mask", index_values=index_values
)

# Specifically for StringArrayNumpySemantics, validate here that we have a valid array
if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy":
assert np.all(
[np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined]
), "wrong missing value sentinels"
Comment by jorisvandenbossche (Member, Author):
This is a bit of a custom check (and we don't do anything similar for other types), but given that I initially overlooked a case where we were creating string arrays with the wrong missing value sentinel because the tests don't actually catch that (two arrays with different missing value sentinels still pass as equal in the case of EAs), I would prefer keeping this in, at least in the short term.
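The sentinel check added here can be illustrated with a small standalone sketch (the helper name is hypothetical, not pandas code): it inspects the raw objects behind the masked positions of the backing ndarray and accepts only a float NaN, so a stray `None` or `pd.NA` sentinel fails the assertion.

```python
import numpy as np

def has_nan_sentinels(values: np.ndarray, na_mask: np.ndarray) -> bool:
    # Check the raw objects at the missing positions: only a float NaN
    # qualifies; other sentinels (None, pd.NA) fail the isinstance/isnan
    # test instead of raising.
    return all(isinstance(v, float) and np.isnan(v) for v in values[na_mask])
```

For example, an object array holding `np.nan` at its masked slot passes, while one holding `None` does not, even though both would compare equal as extension arrays.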

if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy":
assert np.all(
[np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined]
), "wrong missing value sentinels"

left_valid = left[~left_na].to_numpy(dtype=object)
right_valid = right[~right_na].to_numpy(dtype=object)
if check_exact:
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
@@ -25,6 +25,7 @@
)
from pandas.compat.numpy import is_numpy_dev
from pandas.compat.pyarrow import (
HAS_PYARROW,
pa_version_under10p1,
pa_version_under11p0,
pa_version_under13p0,
@@ -156,6 +157,7 @@ def is_ci_environment() -> bool:
"pa_version_under14p1",
"pa_version_under16p0",
"pa_version_under17p0",
"HAS_PYARROW",
"IS64",
"ISMUSL",
"PY311",
2 changes: 2 additions & 0 deletions pandas/compat/pyarrow.py
@@ -17,6 +17,7 @@
pa_version_under15p0 = _palv < Version("15.0.0")
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
pa_version_under11p0 = True
@@ -27,3 +28,4 @@
pa_version_under15p0 = True
pa_version_under16p0 = True
pa_version_under17p0 = True
HAS_PYARROW = False
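The `HAS_PYARROW` flag follows the standard optional-dependency pattern: attempt the import once at module load and record the result, so callers can branch on a boolean without re-attempting the import. A minimal self-contained sketch of that pattern:

```python
# Try the optional dependency once at import time and record availability.
try:
    import pyarrow  # noqa: F401
    HAS_PYARROW = True
except ImportError:
    HAS_PYARROW = False
```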
2 changes: 2 additions & 0 deletions pandas/conftest.py
@@ -1295,6 +1295,7 @@ def nullable_string_dtype(request):
@pytest.fixture(
params=[
"python",
"python_numpy",
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
]
@@ -1356,6 +1357,7 @@ def object_dtype(request):
params=[
"object",
"string[python]",
"string[python_numpy]",
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
]
183 changes: 158 additions & 25 deletions pandas/core/arrays/string_.py
@@ -1,23 +1,31 @@
from __future__ import annotations

import operator
from typing import (
TYPE_CHECKING,
Any,
ClassVar,
Literal,
cast,
)

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_pyarrow_string_dtype,
)

from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.lib import ensure_string_array
from pandas.compat import pa_version_under10p1
from pandas.compat import (
HAS_PYARROW,
pa_version_under10p1,
)
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

@@ -81,7 +89,7 @@ class StringDtype(StorageExtensionDtype):

Parameters
----------
storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
storage : {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}, optional
If not given, the value of ``pd.options.mode.string_storage``.

Attributes
@@ -113,7 +121,7 @@ class StringDtype(StorageExtensionDtype):
# follows NumPy semantics, which uses nan.
@property
def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
if self.storage == "pyarrow_numpy":
if self.storage in ("pyarrow_numpy", "python_numpy"):
return np.nan
else:
return libmissing.NA
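The `na_value` property above picks the sentinel from the storage name: the two NumPy-semantics storages use NaN, the nullable storages use the `pd.NA` singleton. A hypothetical standalone model of that selection (the function name and `PD_NA` stand-in are illustrative, not pandas API):

```python
import math

# Stand-in for pandas' libmissing.NA singleton (illustrative only).
PD_NA = object()

def na_value_for(storage: str):
    # NumPy-semantics storages signal missing values with NaN;
    # the nullable storages use the pd.NA singleton.
    if storage in ("pyarrow_numpy", "python_numpy"):
        return math.nan
    return PD_NA
```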
@@ -122,15 +130,17 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override]

def __init__(self, storage=None) -> None:
if storage is None:
infer_string = get_option("future.infer_string")
if infer_string:
storage = "pyarrow_numpy"
if using_pyarrow_string_dtype():
if HAS_PYARROW:
storage = "pyarrow_numpy"
else:
storage = "python_numpy"
else:
storage = get_option("mode.string_storage")
if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
if storage not in {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}:
raise ValueError(
f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
f"Got {storage} instead."
"Storage must be 'python', 'pyarrow', 'python_numpy' or 'pyarrow_numpy'"
f". Got {storage} instead."
)
if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
raise ImportError(
@@ -178,6 +188,8 @@ def construct_from_string(cls, string) -> Self:
return cls()
elif string == "string[python]":
return cls(storage="python")
elif string == "string[python_numpy]":
return cls(storage="python_numpy")
elif string == "string[pyarrow]":
return cls(storage="pyarrow")
elif string == "string[pyarrow_numpy]":
@@ -207,6 +219,8 @@ def construct_array_type( # type: ignore[override]
return StringArray
elif self.storage == "pyarrow":
return ArrowStringArray
elif self.storage == "python_numpy":
return StringArrayNumpySemantics
else:
return ArrowStringArrayNumpySemantics

@@ -238,7 +252,7 @@ def __from_arrow__(
# convert chunk by chunk to numpy and concatenate them, to avoid
# overflow for large string data when concatenating the pyarrow arrays
arr = arr.to_numpy(zero_copy_only=False)
arr = ensure_string_array(arr, na_value=libmissing.NA)
arr = ensure_string_array(arr, na_value=self.na_value)
results.append(arr)

if len(chunks) == 0:
@@ -248,11 +262,7 @@

# Bypass validation inside StringArray constructor, see GH#47781
new_string_array = StringArray.__new__(StringArray)
NDArrayBacked.__init__(
new_string_array,
arr,
StringDtype(storage="python"),
)
NDArrayBacked.__init__(new_string_array, arr, self)
return new_string_array


@@ -360,14 +370,15 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc]

# undo the NumpyExtensionArray hack
_typ = "extension"
_storage = "python"

def __init__(self, values, copy: bool = False) -> None:
values = extract_array(values)

super().__init__(values, copy=copy)
if not isinstance(values, type(self)):
self._validate()
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage))

def _validate(self) -> None:
"""Validate that we only store NA or strings."""
@@ -385,22 +396,41 @@ def _validate(self) -> None:
else:
lib.convert_nans_to_NA(self._ndarray)

def _validate_scalar(self, value):
# used by NDArrayBackedExtensionIndex.insert
if isna(value):
return self.dtype.na_value
elif not isinstance(value, str):
raise TypeError(
f"Cannot set non-string value '{value}' into a string array."
)
return value

@classmethod
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> Self:
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "python"
assert isinstance(dtype, StringDtype) and dtype.storage in (
"python",
"python_numpy",
)
else:
if get_option("future.infer_string"):
dtype = StringDtype(storage="python_numpy")
else:
dtype = StringDtype(storage="python")

from pandas.core.arrays.masked import BaseMaskedArray

na_value = dtype.na_value
if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
result[na_values] = libmissing.NA
result[na_values] = na_value

else:
if lib.is_pyarrow_array(scalars):
@@ -409,12 +439,12 @@ def _from_sequence(
# zero_copy_only to True which caused problems see GH#52076
scalars = np.array(scalars)
# convert non-na-likes to str, and nan-likes to StringDtype().na_value
result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy)

# Manually creating new array avoids the validation step in the __init__, so is
# faster. Refactor need for validation?
new_string_array = cls.__new__(cls)
NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))
NDArrayBacked.__init__(new_string_array, result, dtype)

return new_string_array

@@ -464,7 +494,7 @@ def __setitem__(self, key, value) -> None:
# validate new items
if scalar_value:
if isna(value):
value = libmissing.NA
value = self.dtype.na_value
elif not isinstance(value, str):
raise TypeError(
f"Cannot set non-string value '{value}' into a StringArray."
@@ -478,7 +508,7 @@ def __setitem__(self, key, value) -> None:
mask = isna(value)
if mask.any():
value = value.copy()
value[isna(value)] = libmissing.NA
value[isna(value)] = self.dtype.na_value

super().__setitem__(key, value)

@@ -600,9 +630,9 @@ def _cmp_method(self, other, op):

if op.__name__ in ops.ARITHMETIC_BINOPS:
result = np.empty_like(self._ndarray, dtype="object")
result[mask] = libmissing.NA
result[mask] = self.dtype.na_value
result[valid] = op(self._ndarray[valid], other)
return StringArray(result)
return self._from_backing_data(result)
else:
# logical
result = np.zeros(len(self._ndarray), dtype="bool")
@@ -671,3 +701,106 @@ def _str_map(
# or .findall returns a list).
# -> We don't know the result type. E.g. `.get` can return anything.
return lib.map_infer_mask(arr, f, mask.view("uint8"))


class StringArrayNumpySemantics(StringArray):
_storage = "python_numpy"

def _validate(self) -> None:
"""Validate that we only store NaN or strings."""
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
raise ValueError(
"StringArrayNumpySemantics requires a sequence of strings or NaN"
)
if self._ndarray.dtype != "object":
raise ValueError(
"StringArrayNumpySemantics requires a sequence of strings or NaN. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
# TODO validate or force NA/None to NaN

@classmethod
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> Self:
if dtype is None:
dtype = StringDtype(storage="python_numpy")
return super()._from_sequence(scalars, dtype=dtype, copy=copy)

def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:
# need to override NumpyExtensionArray._from_backing_data to ensure
# we always preserve the dtype
return NDArrayBacked._from_backing_data(self, arr)

def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
# the masked_reductions use pd.NA
if result is libmissing.NA:
return np.nan
Comment (Member):
might want to return self._na_value here to make things explicit

return super()._wrap_reduction_result(axis, result)

def _cmp_method(self, other, op):
result = super()._cmp_method(other, op)
if op == operator.ne:
return result.to_numpy(np.bool_, na_value=True)
else:
return result.to_numpy(np.bool_, na_value=False)
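The `_cmp_method` override above can be modeled outside pandas as follows (illustrative helper, not the actual implementation): under NumPy semantics NaN compares unequal to everything, so missing slots become `True` only for `!=` and `False` for every other comparison operator.

```python
import operator
import numpy as np

def cmp_with_nan_semantics(values, mask, other, op):
    # Compare only the valid entries, then fill the missing slots with the
    # operator-dependent default: True for `!=`, False otherwise.
    result = np.zeros(len(values), dtype=bool)
    valid = ~mask
    result[valid] = op(values[valid], other)
    result[mask] = op is operator.ne
    return result

vals = np.array(["a", "ignored", "b"], dtype=object)
mask = np.array([False, True, False])
```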

def value_counts(self, dropna: bool = True) -> Series:
from pandas.core.algorithms import value_counts_internal as value_counts

result = value_counts(self._ndarray, sort=False, dropna=dropna)
result.index = result.index.astype(self.dtype)
return result

# ------------------------------------------------------------------------
# String methods interface
_str_na_value = np.nan

def _str_map(
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
if dtype is None:
dtype = self.dtype
if na_value is None:
na_value = self.dtype.na_value

mask = isna(self)
arr = np.asarray(self)
convert = convert and not np.all(mask)

if is_integer_dtype(dtype) or is_bool_dtype(dtype):
na_value_is_na = isna(na_value)
if na_value_is_na:
if is_integer_dtype(dtype):
na_value = 0
else:
na_value = True

result = lib.map_infer_mask(
arr,
f,
mask.view("uint8"),
convert=False,
na_value=na_value,
dtype=np.dtype(cast(type, dtype)),
)
if na_value_is_na and mask.any():
Comment (Member):
This method (which has now been refactored to _str_map_nan_semantics) is slightly different in StringArray vs ArrowStringArray, and I'm trying to sort out whether the differences are intentional or just cosmetic. Could use some help from the author.

  1. the Arrow version handles this doing the check before map_infer_mask and changing the dtype passed there (also doesn't check for na_value_is_na)

  2. the Arrow version sets na_value = np.nan/False on the analogue to L837/839 (again without a na_value_is_na check)

  3. the Arrow version doesn't have the L831 convert = convert and not np.all(mask); AFAICT no existing tests rely on that line

Comment (Member):
Whoops, my claim in 3 about it not mattering was incorrect; it matters for test_contains_nan and test_empty_str_methods.

Comment by jorisvandenbossche (Member, Author):
> could use some help from the author

Although an author who wrote this code almost 4 months ago ;)

Will take a closer look at it later today, but one quick find is that there were changes to the arrow version after I started this PR, so I might not have taken those into account in this version, e.g. #58483.

Comment (Member):
I've convinced myself that the Arrow version doesn't need the na_value_is_na check because it is always True.

Comment (Member):
... and that 'convert' is never used

if is_integer_dtype(dtype):
result = result.astype("float64")
else:
result = result.astype("object")
result[mask] = np.nan
return result
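The integer-result branch above works around a NumPy constraint: integer arrays cannot hold NaN, so the map runs with a placeholder fill value and the result is cast to float64 before NaN is written back. A self-contained sketch of that pattern (the helper name is illustrative, not the pandas function):

```python
import numpy as np

def map_int_with_nan(strings, mask, func):
    # Map with a placeholder (0) at missing positions so the intermediate
    # result can stay int64.
    out = np.zeros(len(strings), dtype=np.int64)
    for i, s in enumerate(strings):
        if not mask[i]:
            out[i] = func(s)
    # NumPy integer arrays cannot represent NaN, so cast to float64
    # before restoring the missing values.
    if mask.any():
        out = out.astype("float64")
        out[mask] = np.nan
    return out
```

When no values are missing, the result stays int64; any missing value forces the float64 upcast.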

elif is_string_dtype(dtype) and not is_object_dtype(dtype):
# i.e. StringDtype
result = lib.map_infer_mask(
arr, f, mask.view("uint8"), convert=False, na_value=na_value
)
return type(self)(result)
else:
# This is when the result type is object. We reach this when
Comment (Member):
Shouldn't this raise an error or not be possible in the first place?

Comment (Member):
Some str methods are weird (i.e. what's in the comment here).

Comment by jorisvandenbossche (Member, Author):
And not only weird: there are some methods that genuinely return an object dtype (for lack of a better proper dtype, but right now with the default dtype this is object dtype). For example, ser.str.split() returns list elements.
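The split() case is easy to demonstrate standalone: element-wise split() yields a Python list per element, and a ragged collection of lists has no better home than an object-dtype array.

```python
import numpy as np

# Element-wise split() produces one Python list per element; the result
# must therefore stay object dtype.
arr = np.array(["a,b", "c"], dtype=object)
result = np.empty(len(arr), dtype=object)
for i, s in enumerate(arr):
    result[i] = s.split(",")
```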

Comment (Member):
Makes sense. The list-returning functions are further good use cases for PDEP-13 (#58455).

# -> We know the result type is truly object (e.g. .encode returns bytes
# or .findall returns a list).
# -> We don't know the result type. E.g. `.get` can return anything.
return lib.map_infer_mask(arr, f, mask.view("uint8"))