WIP: preliminary support for stringdtype

ngoldbaum · ngoldbaum · commit 56ae25251f34 · 2023-05-15T10:11:14.000-06:00
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
@@ -41,23 +41,30 @@
     // pip (with all the conda available packages installed first,
     // followed by the pip installed packages).
     "matrix": {
-        "Cython": ["0.29.33"],
-        "matplotlib": [],
-        "sqlalchemy": [],
-        "scipy": [],
-        "numba": [],
-        "numexpr": [],
-        "pytables": [null, ""],  // platform dependent, see excludes below
-        "pyarrow": [],
-        "tables": [null, ""],
-        "openpyxl": [],
-        "xlsxwriter": [],
-        "xlrd": [],
-        "odfpy": [],
-        "jinja2": [],
-        "meson": [],
-        "meson-python": [],
-        "python-build": [],
+        "req": {
+            "pip+/home/nathan/Documents/numpy": [],
+            "Cython": ["0.29.33"],
+            "matplotlib": [],
+            "sqlalchemy": [],
+            "scipy": [],
+            "numba": [],
+            "numexpr": [],
+            "pytables": [null, ""],  // platform dependent, see excludes below
+            "pyarrow": [],
+            "tables": [null, ""],
+            "openpyxl": [],
+            "xlsxwriter": [],
+            "xlrd": [],
+            "odfpy": [],
+            "jinja2": [],
+            "meson": [],
+            "meson-python": [],
+            "python-build": [],
+            "pip+/home/nathan/Documents/numpy-user-dtypes/stringdtype": []
+        },
+        "env": {
+            "NUMPY_EXPERIMENTAL_DTYPE_API": "1"
+        }
     },
     "conda_channels": ["conda-forge"],
     // Combinations of libraries/python versions can be excluded/included
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -1,6 +1,7 @@
 import warnings
 
 import numpy as np
+from stringdtype import StringDType
 
 from pandas import (
     NA,
@@ -14,24 +15,38 @@
 
 
 class Dtypes:
-    params = ["str", "string[python]", "string[pyarrow]"]
+    params = ["str", "string[python]", "string[pyarrow]", StringDType()]
     param_names = ["dtype"]
+    dtype_mapping = {
+        "str": "str",
+        "string[python]": object,
+        "string[pyarrow]": object,
+        StringDType(): StringDType(),
+    }
 
     def setup(self, dtype):
         try:
-            self.s = Series(tm.makeStringIndex(10**5), dtype=dtype)
+            self.s = Series(
+                tm.makeStringIndex(10**5, dtype=self.dtype_mapping[dtype]),
+                dtype=dtype,
+            )
         except ImportError:
             raise NotImplementedError
 
 
 class Construction:
     params = (
         ["series", "frame", "categorical_series"],
-        ["str", "string[python]", "string[pyarrow]"],
+        ["str", "string[python]", "string[pyarrow]", StringDType()],
     )
     param_names = ["pd_type", "dtype"]
     pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series}
-    dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object}
+    dtype_mapping = {
+        "str": "str",
+        "string[python]": object,
+        "string[pyarrow]": object,
+        StringDType(): StringDType(),
+    }
 
     def setup(self, pd_type, dtype):
         series_arr = tm.rands_array(
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1391,9 +1391,9 @@ cdef object _try_infer_map(object dtype):
     return None
 
 
-def infer_dtype(value: object, skipna: bool = True) -> str:
+def infer_dtype(value: object, skipna: bool = True) -> object:
     """
-    Return a string label of the type of a scalar or list-like of values.
+    Return a string label, or a dtype object, for the type of a scalar or list-like.
 
     Parameters
     ----------
@@ -1403,7 +1403,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
 
     Returns
     -------
-    str
+    str or dtype object
         Describing the common type of the input data.
     Results can include:
 
@@ -1427,6 +1427,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
     - mixed
     - unknown-array
 
+    Returns the dtype object itself for non-legacy NumPy string dtypes.
+
     Raises
     ------
     TypeError
@@ -1529,6 +1531,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
     if inferred is not None:
         # Anything other than object-dtype should return here.
         return inferred
+    elif not getattr(type(values.dtype), "_legacy", True):
+        if issubclass(values.dtype.type, str):
+            return values.dtype
 
     if values.descr.type_num != NPY_OBJECT:
         # i.e. values.dtype != np.object_
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -356,8 +356,8 @@ def getCols(k) -> str:
 
 
 # make index
-def makeStringIndex(k: int = 10, name=None) -> Index:
-    return Index(rands_array(nchars=10, size=k), name=name)
+def makeStringIndex(k: int = 10, name=None, dtype: NpDtype = "O") -> Index:
+    return Index(rands_array(nchars=10, size=k, dtype=dtype), name=name)
 
 
 def makeCategoricalIndex(
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -35,6 +35,7 @@
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_integer,
+    is_legacy_string_dtype,
 )
 from pandas.core.dtypes.generic import (
     ABCExtensionArray,
@@ -243,7 +244,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi
         # has incompatible type "Iterable[Any]"; expected "Sized"
         return construct_1d_object_array_from_listlike(values)  # type: ignore[arg-type]
 
-    if issubclass(result.dtype.type, str):
+    if is_legacy_string_dtype(result.dtype):
         result = np.asarray(values, dtype=object)
 
     if result.ndim == 2:
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -43,6 +43,7 @@
     maybe_promote,
 )
 from pandas.core.dtypes.common import (
+    is_legacy_string_dtype,
     is_list_like,
     is_object_dtype,
     pandas_dtype,
@@ -708,7 +709,7 @@ def _sanitize_str_dtypes(
 
     # This is to prevent mixed-type Series getting all casted to
     # NumPy string type, e.g. NaN --> '-1#IND'.
-    if issubclass(result.dtype.type, str):
+    if is_legacy_string_dtype(result.dtype):
         # GH#16605
         # If not empty convert the data to dtype
         # GH#19853: If data is a scalar, result has already the result
diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py
@@ -18,6 +18,7 @@
 from pandas.errors import IntCastingNaNError
 
 from pandas.core.dtypes.common import (
+    is_legacy_string_dtype,
     is_object_dtype,
     is_string_dtype,
     pandas_dtype,
@@ -89,7 +90,7 @@ def _astype_nansafe(
         res = arr.astype(dtype, copy=copy)
         return np.asarray(res)
 
-    if issubclass(dtype.type, str):
+    if issubclass(dtype.type, str) and is_legacy_string_dtype(dtype):
         shape = arr.shape
         if arr.ndim > 1:
             arr = arr.ravel()
@@ -183,7 +184,7 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra
         values = _astype_nansafe(values, dtype, copy=copy)
 
     # in pandas we don't store numpy str dtypes, so convert to object
-    if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
+    if isinstance(dtype, np.dtype) and is_legacy_string_dtype(values.dtype):
         values = np.array(values, dtype=object)
 
     return values
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -515,7 +515,7 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool:
     """
     Faster alternative to is_string_dtype, assumes we have a np.dtype object.
     """
-    return dtype == object or dtype.kind in "SU"
+    return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str)
 
 
 def is_string_dtype(arr_or_dtype) -> bool:
@@ -1662,6 +1662,44 @@ def is_all_strings(value: ArrayLike) -> bool:
     return dtype == "string"
 
 
+def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool:
+    """Check whether the dtype is a NumPy legacy string dtype.
+
+    Parameters
+    ----------
+    arr_or_dtype : array-like or dtype
+        The array-like or dtype to check.
+
+    include_bytes : bool, default False
+        Whether to also accept legacy bytes dtypes.
+
+    Returns
+    -------
+    bool
+        True for legacy NumPy dtypes whose scalar type is a ``str``
+        subclass, False otherwise. If include_bytes is True, also True
+        for legacy dtypes whose scalar type is a ``bytes`` subclass.
+
+    """
+    if arr_or_dtype is None:
+        return False
+
+    dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype)
+
+    if not isinstance(dtype, np.dtype):
+        return False
+
+    # The _legacy attribute was added in NumPy 1.25. If the attribute isn't
+    # defined on the dtype class, NumPy isn't sufficiently new, so we have to
+    # be dealing with a legacy dtype.
+    is_legacy = getattr(type(dtype), "_legacy", True)
+    if not is_legacy:
+        return False
+    if include_bytes:
+        return issubclass(dtype.type, (str, bytes))
+    return issubclass(dtype.type, str)
+
+
 __all__ = [
     "classes",
     "DT64NS_DTYPE",
@@ -1696,6 +1734,7 @@ def is_all_strings(value: ArrayLike) -> bool:
     "is_interval",
     "is_interval_dtype",
     "is_iterator",
+    "is_legacy_string_dtype",
     "is_named_tuple",
     "is_nested_list_like",
     "is_number",
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -275,7 +275,6 @@ def _from_values_or_dtype(
         >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
         CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object)
         """
-
         if dtype is not None:
             # The dtype argument takes precedence over values.dtype (if any)
             if isinstance(dtype, str):
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -11,6 +11,7 @@
 )
 
 import numpy as np
+from stringdtype import StringDType
 
 from pandas._config import get_option
 
@@ -305,6 +306,11 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo
 
     if dtype.kind in ("S", "U"):
         result = np.zeros(values.shape, dtype=bool)
+    elif type(dtype) is StringDType:
+        if inf_as_na:
+            result = ~np.isfinite(values)
+        else:
+            result = np.isnan(values)
     else:
         if values.ndim in {1, 2}:
             result = libmissing.isnaobj(values, inf_as_na=inf_as_na)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -21,6 +21,7 @@
 import warnings
 
 import numpy as np
+from stringdtype import StringDType
 
 from pandas._config import get_option
 
@@ -506,7 +507,7 @@ def __new__(
             if isinstance(data, ABCMultiIndex):
                 data = data._values
 
-            if data.dtype.kind not in "iufcbmM":
+            if data.dtype.kind not in "iufcbmM" and type(data.dtype) != StringDType:
                 # GH#11836 we need to avoid having numpy coerce
                 # things that look like ints/floats to ints unless
                 # they are actually ints, e.g. '0' and 0.0
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -57,6 +57,7 @@
 from pandas.core.dtypes.common import (
     ensure_platform_int,
     is_1d_only_ea_dtype,
+    is_legacy_string_dtype,
     is_list_like,
     is_string_dtype,
 )
@@ -2317,7 +2318,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
     if isinstance(values, np.ndarray):
         values = ensure_wrapped_if_datetimelike(values)
 
-        if issubclass(values.dtype.type, str):
+        if is_legacy_string_dtype(values.dtype):
             values = np.array(values, dtype=object)
 
     if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None:
@@ -2347,15 +2348,29 @@ def get_block_type(dtype: DtypeObj) -> type[Block]:
         # Note: need to be sure PandasArray is unwrapped before we get here
         return ExtensionBlock
 
-    # We use kind checks because it is much more performant
-    #  than is_foo_dtype
-    kind = dtype.kind
-    if kind in "Mm":
-        return DatetimeLikeBlock
-    elif kind in "fciub":
-        return NumericBlock
+    dtype_class = type(dtype)
+
+    # the _is_numeric attribute was added in NumPy 1.25; if it is missing, fall
+    # back to dtype.kind checks, using ObjectBlock for any other kind.
+    try:
+        is_numeric = dtype_class._is_numeric
+    except AttributeError:
+        # We use kind checks because it is much more performant
+        #  than is_foo_dtype
+        kind = dtype.kind
+        if kind in "Mm":
+            return DatetimeLikeBlock
+        elif kind in "fciub":
+            return NumericBlock
+        else:
+            return ObjectBlock
 
-    return ObjectBlock
+    if is_numeric:
+        return NumericBlock
+    else:
+        if is_legacy_string_dtype(dtype):
+            return ObjectBlock
+        return NumpyBlock
 
 
 def new_block_2d(
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -28,6 +28,7 @@
 from pandas.core.dtypes.common import (
     is_1d_only_ea_dtype,
     is_integer_dtype,
+    is_legacy_string_dtype,
     is_list_like,
     is_named_tuple,
     is_object_dtype,
@@ -330,7 +331,7 @@ def ndarray_to_mgr(
     _check_values_indices_shape_match(values, index, columns)
 
     if typ == "array":
-        if issubclass(values.dtype.type, str):
+        if is_legacy_string_dtype(values.dtype):
             values = np.array(values, dtype=object)
 
         if dtype is None and is_object_dtype(values.dtype):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py