add NumpyStringArray and string[numpy] dtype

ngoldbaum · ngoldbaum · commit 206d2f061d32 · 2023-05-15T10:11:14.000-06:00
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -15,12 +15,19 @@
 
 
 class Dtypes:
-    params = ["str", "string[python]", "string[pyarrow]", StringDType()]
+    params = [
+        "str",
+        "string[python]",
+        "string[pyarrow]",
+        "string[numpy]",
+        StringDType(),
+    ]
     param_names = ["dtype"]
     dtype_mapping = {
         "str": "str",
         "string[python]": object,
         "string[pyarrow]": object,
+        "string[numpy]": StringDType(),
         StringDType(): StringDType(),
     }
 
@@ -37,14 +44,15 @@ def setup(self, dtype):
 class Construction:
     params = (
         ["series", "frame", "categorical_series"],
-        ["str", "string[python]", "string[pyarrow]", StringDType()],
+        ["str", "string[python]", "string[pyarrow]", "string[numpy]", StringDType()],
     )
     param_names = ["pd_type", "dtype"]
     pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series}
     dtype_mapping = {
         "str": "str",
         "string[python]": object,
         "string[pyarrow]": object,
+        "string[numpy]": StringDType(),
         StringDType(): StringDType(),
     }
 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1860,7 +1860,7 @@ cdef class StringValidator(Validator):
         return isinstance(value, str)
 
     cdef bint is_array_typed(self) except -1:
-        return issubclass(self.dtype.type, np.str_)
+        return issubclass(self.dtype.type, (np.str_, str))
 
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -17,7 +17,11 @@
     period_array,
 )
 from pandas.core.arrays.sparse import SparseArray
-from pandas.core.arrays.string_ import StringArray
+from pandas.core.arrays.string_ import (
+    NumpyStringArray,
+    ObjectStringArray,
+    StringArray,
+)
 from pandas.core.arrays.string_arrow import ArrowStringArray
 from pandas.core.arrays.timedeltas import TimedeltaArray
 
@@ -39,5 +43,7 @@
     "period_array",
     "SparseArray",
     "StringArray",
+    "ObjectStringArray",
+    "NumpyStringArray",
     "TimedeltaArray",
 ]
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -14,7 +14,10 @@
     missing as libmissing,
 )
 from pandas._libs.arrays import NDArrayBacked
-from pandas.compat import pa_version_under7p0
+from pandas.compat import (
+    is_numpy_dev,
+    pa_version_under7p0,
+)
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import doc
 
@@ -24,6 +27,7 @@
     register_extension_dtype,
 )
 from pandas.core.dtypes.common import (
+    get_string_dtype,
     is_array_like,
     is_bool_dtype,
     is_integer_dtype,
@@ -76,7 +80,7 @@ class StringDtype(StorageExtensionDtype):
 
     Parameters
     ----------
-    storage : {"python", "pyarrow"}, optional
+    storage : {"python", "pyarrow", "numpy"}, optional
         If not given, the value of ``pd.options.mode.string_storage``.
 
     Attributes
@@ -108,14 +112,17 @@ def na_value(self) -> libmissing.NAType:
     def __init__(self, storage=None) -> None:
         if storage is None:
             storage = get_option("mode.string_storage")
-        if storage not in {"python", "pyarrow"}:
+        if storage not in {"python", "pyarrow", "numpy"}:
             raise ValueError(
-                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
+                "Storage must be 'python', 'pyarrow', or 'numpy'. "
+                f"Got {storage} instead."
             )
         if storage == "pyarrow" and pa_version_under7p0:
             raise ImportError(
                 "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
             )
+        if storage == "numpy" and not is_numpy_dev:
+            raise ImportError("NumPy backed string storage requires numpy dev")
         self.storage = storage
 
     @property
@@ -139,6 +146,7 @@ def construct_from_string(cls, string):
             ``'string'``               pd.options.mode.string_storage, default python
             ``'string[python]'``       python
             ``'string[pyarrow]'``      pyarrow
+            ``'string[numpy]'``        numpy
             ========================== ==============================================
 
         Returns
@@ -160,6 +168,8 @@ def construct_from_string(cls, string):
             return cls(storage="python")
         elif string == "string[pyarrow]":
             return cls(storage="pyarrow")
+        elif string == "string[numpy]":
+            return cls(storage="numpy")
         else:
             raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
 
@@ -179,9 +189,13 @@ def construct_array_type(  # type: ignore[override]
         from pandas.core.arrays.string_arrow import ArrowStringArray
 
         if self.storage == "python":
-            return StringArray
-        else:
+            return ObjectStringArray
+        elif self.storage == "pyarrow":
             return ArrowStringArray
+        elif self.storage == "numpy":
+            return NumpyStringArray
+        else:
+            raise NotImplementedError
 
     def __from_arrow__(
         self, array: pyarrow.Array | pyarrow.ChunkedArray
@@ -231,7 +245,7 @@ def tolist(self):
 
 # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
 # incompatible with definition in base class "ExtensionArray"
-class StringArray(BaseStringArray, PandasArray):  # type: ignore[misc]
+class BaseNumpyStringArray(BaseStringArray, PandasArray):  # type: ignore[misc]
     """
     Extension array for string data.
 
@@ -321,54 +335,23 @@ def __init__(self, values, copy: bool = False) -> None:
         super().__init__(values, copy=copy)
         if not isinstance(values, type(self)):
             self._validate()
-        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
+        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage))
 
     def _validate(self):
         """Validate that we only store NA or strings."""
         if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
             raise ValueError("StringArray requires a sequence of strings or pandas.NA")
-        if self._ndarray.dtype != "object":
+        if self._ndarray.dtype != self._cache_dtype:
             raise ValueError(
-                "StringArray requires a sequence of strings or pandas.NA. Got "
+                f"{type(self).__name__} requires a sequence of strings or "
+                "pandas.NA convertible to a NumPy array with dtype "
+                f"{self._cache_dtype}. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
-        # Check to see if need to convert Na values to pd.NA
-        if self._ndarray.ndim > 2:
-            # Ravel if ndims > 2 b/c no cythonized version available
-            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
-        else:
-            lib.convert_nans_to_NA(self._ndarray)
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
-        if dtype and not (isinstance(dtype, str) and dtype == "string"):
-            dtype = pandas_dtype(dtype)
-            assert isinstance(dtype, StringDtype) and dtype.storage == "python"
-
-        from pandas.core.arrays.masked import BaseMaskedArray
-
-        if isinstance(scalars, BaseMaskedArray):
-            # avoid costly conversion to object dtype
-            na_values = scalars._mask
-            result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
-            result[na_values] = libmissing.NA
-
-        else:
-            if hasattr(scalars, "type"):
-                # pyarrow array; we cannot rely on the "to_numpy" check in
-                #  ensure_string_array because calling scalars.to_numpy would set
-                #  zero_copy_only to True which caused problems see GH#52076
-                scalars = np.array(scalars)
-            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
-            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
-
-        # Manually creating new array avoids the validation step in the __init__, so is
-        # faster. Refactor need for validation?
-        new_string_array = cls.__new__(cls)
-        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))
-
-        return new_string_array
+        raise NotImplementedError("_from_sequence must be implemented in subclasses")
 
     @classmethod
     def _from_sequence_of_strings(
@@ -612,3 +595,71 @@ def _str_map(
             #    or .findall returns a list).
             # -> We don't know the result type. E.g. `.get` can return anything.
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
+
+
+class ObjectStringArray(BaseNumpyStringArray):
+    _cache_dtype = "object"
+    _storage = "python"
+
+    def _validate(self):
+        super()._validate()
+        # Check to see if need to convert Na values to pd.NA
+        if self._ndarray.ndim > 2:
+            # Ravel if ndims > 2 b/c no cythonized version available
+            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
+        else:
+            lib.convert_nans_to_NA(self._ndarray)
+
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
+        if dtype and not (isinstance(dtype, str) and dtype == "string"):
+            dtype = pandas_dtype(dtype)
+            assert isinstance(dtype, StringDtype) and dtype.storage == "python"
+
+        from pandas.core.arrays.masked import BaseMaskedArray
+
+        if isinstance(scalars, BaseMaskedArray):
+            # avoid costly conversion to object dtype
+            na_values = scalars._mask
+            result = scalars._data
+            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result[na_values] = libmissing.NA
+
+        else:
+            if hasattr(scalars, "type"):
+                # pyarrow array; we cannot rely on the "to_numpy" check in
+                #  ensure_string_array because calling scalars.to_numpy would set
+                #  zero_copy_only to True which caused problems see GH#52076
+                scalars = np.array(scalars)
+            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
+            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
+
+        # Manually creating new array avoids the validation step in the __init__, so is
+        # faster. Refactor need for validation?
+        new_string_array = cls.__new__(cls)
+        NDArrayBacked.__init__(
+            new_string_array, result, StringDtype(storage=cls._storage)
+        )
+
+        return new_string_array
+
+
+StringArray = ObjectStringArray
+
+
+class NumpyStringArray(BaseNumpyStringArray):
+    _cache_dtype = get_string_dtype()
+    _storage = "numpy"
+
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
+        result = np.array(scalars, dtype=cls._cache_dtype)
+
+        # Manually creating new array avoids the validation step in the __init__, so is
+        # faster. Refactor need for validation?
+        new_string_array = cls.__new__(cls)
+        NDArrayBacked.__init__(
+            new_string_array, result, StringDtype(storage=cls._storage)
+        )
+
+        return new_string_array
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -536,6 +536,7 @@ def sanitize_array(
     -------
     np.ndarray or ExtensionArray
     """
+
     if isinstance(data, ma.MaskedArray):
         data = sanitize_masked_array(data)
 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -17,6 +17,7 @@
     Period,
     algos,
     lib,
+    missing,
 )
 from pandas._libs.tslibs import conversion
 from pandas.util._exceptions import find_stack_level
@@ -518,6 +519,18 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool:
     return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str)
 
 
+def get_string_dtype():
+    """Return ``stringdtype.StringDType`` with ``missing.NA`` as its NA object."""
+    import os
+
+    # Called at import time by other modules: raise instead of exiting.
+    if os.environ.get("NUMPY_EXPERIMENTAL_DTYPE_API") != "1":
+        raise ImportError("stringdtype requires NUMPY_EXPERIMENTAL_DTYPE_API=1")
+    import stringdtype
+
+    return stringdtype.StringDType(na_object=missing.NA)
+
+
 def is_string_dtype(arr_or_dtype) -> bool:
     """
     Check whether the provided array or dtype is of the string dtype.
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -11,7 +11,6 @@
 )
 
 import numpy as np
-from stringdtype import StringDType
 
 from pandas._config import get_option
 
@@ -26,6 +25,7 @@
     DT64NS_DTYPE,
     TD64NS_DTYPE,
     ensure_object,
+    get_string_dtype,
     is_scalar,
     is_string_or_object_np_dtype,
 )
@@ -300,6 +300,9 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False):
     return result
 
 
+StringDType = type(get_string_dtype())
+
+
 def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]:
     # Working around NumPy ticket 1542
     dtype = values.dtype
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -21,7 +21,6 @@
 import warnings
 
 import numpy as np
-from stringdtype import StringDType
 
 from pandas._config import get_option
 
@@ -507,7 +506,7 @@ def __new__(
             if isinstance(data, ABCMultiIndex):
                 data = data._values
 
-            if data.dtype.kind not in "iufcbmM" and type(data.dtype) != StringDType:
+            if data.dtype.kind not in "iufcbmM":
                 # GH#11836 we need to avoid having numpy coerce
                 # things that look like ints/floats to ints unless
                 # they are actually ints, e.g. '0' and 0.0
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -14,7 +14,6 @@
 import unicodedata
 
 import numpy as np
-from stringdtype import StringDType
 
 from pandas._libs import lib
 import pandas._libs.missing as libmissing
@@ -82,10 +81,7 @@ def _str_map(
 
         arr = np.asarray(self)
         mask = isna(arr)
-        type(arr.dtype)
-        map_convert = (
-            convert and not np.all(mask) and type(arr.dtype) is not StringDType
-        )
+        map_convert = convert and not np.all(mask)
         try:
             result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
         except (TypeError, AttributeError) as err: