pandas-dev · jorisvandenbossche · Jun 8, 2021 · Jul 10, 2020 · Sep 3, 2020 · Feb 18, 2021
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -412,6 +412,8 @@ def __ne__(self, other: Any) -> ArrayLike:
         """
         Return for `self != other` (element-wise in-equality).
         """
+        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)):
+            return NotImplemented
         return ~(self == other)
 
     def to_numpy(
@@ -516,7 +518,6 @@ def astype(self, dtype, copy=True):
             NumPy ndarray with 'dtype' for its dtype.
         """
         from pandas.core.arrays.string_ import StringDtype
-        from pandas.core.arrays.string_arrow import ArrowStringDtype
 
         dtype = pandas_dtype(dtype)
         if is_dtype_equal(dtype, self.dtype):
@@ -526,9 +527,7 @@ def astype(self, dtype, copy=True):
                 return self.copy()
 
         # FIXME: Really hard-code here?
-        if isinstance(
-            dtype, (ArrowStringDtype, StringDtype)
-        ):  # allow conversion to StringArrays
+        if isinstance(dtype, StringDtype):  # allow conversion to StringArrays
             return dtype.construct_array_type()._from_sequence(self, copy=False)
 
         return np.array(self, dtype=dtype, copy=copy)
@@ -1038,9 +1037,9 @@ def take(
               from the right (the default). This is similar to
               :func:`numpy.take`.
 
-            * True: negative values in `indices` indicate
-              missing values. These values are set to `fill_value`. Any other
-              other negative values raise a ``ValueError``.
+            * True: ``-1`` in `indices` indicates missing values.
+              These values are set to `fill_value`. Any other negative
+              value raises a ``ValueError``.
 
         fill_value : any, optional
             Fill value to use for NA-indices when `allow_fill` is True.

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -2,13 +2,16 @@
 
 from typing import (
     TYPE_CHECKING,
+    Any,
     Optional,
     Type,
     Union,
 )
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import (
     lib,
     missing as libmissing,
@@ -49,6 +52,8 @@
 if TYPE_CHECKING:
     import pyarrow
 
+    from pandas.core.arrays.string_arrow import ArrowStringArray
+
 
 @register_extension_dtype
 class StringDtype(ExtensionDtype):
@@ -79,37 +84,114 @@ class StringDtype(ExtensionDtype):
     StringDtype
     """
 
-    name = "string"
-
     #: StringDtype.na_value uses pandas.NA
     na_value = libmissing.NA
+    _metadata = ("storage",)
+
+    def __init__(self, storage=None):
+        if storage is None:
+            storage = get_option("mode.string_storage")
+        if storage not in {"python", "pyarrow"}:
+            raise ValueError(
+                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
+            )
+        self.storage = storage
+
+    @property
+    def name(self):
+        return f"StringDtype[{self.storage}]"
 
     @property
     def type(self) -> Type[str]:
         return str
 
     @classmethod
-    def construct_array_type(cls) -> Type[StringArray]:
+    def construct_from_string(cls, string):
+        """
+        Construct a StringDtype from a string.
+
+        Parameters
+        ----------
+        string : str
+            The type of the name. The storage type will be taken from `string`.
+            Valid options and their storage types are
+
+            ========================== ==============
+            string                     result storage
+            ========================== ==============
+            ``'string'``               global default
+            ``'string[python]'``       python
+            ``'StringDtype[python]'``  python
+            ``'string[pyarrow]'``      pyarrow
+            ``'StringDtype[pyarrow]'`` pyarrow
+            ========================== ==============
+
+        Returns
+        -------
+        StringDtype
+
+        Raises
+        ------
+        TypeError
+            If the string is not a valid option.
+
+        """
+        if not isinstance(string, str):
+            raise TypeError(
+                f"'construct_from_string' expects a string, got {type(string)}"
+            )
+        if string == "string":
+            # storage=None lets __init__ pick the "mode.string_storage" default
+            return cls()
+        elif string in {"string[python]", "StringDtype[python]"}:
+            return cls(storage="python")
+        elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}:
+            return cls(storage="pyarrow")
+        else:
+            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, str) and other == "string":
+            return True
+        return super().__eq__(other)
+
+    def __hash__(self) -> int:
+        # custom __eq__ so have to override __hash__
+        return super().__hash__()
+
+    # TODO: this is a classmethod, but we need to know the storage type.
+    # error: Signature of "construct_array_type" incompatible with supertype
+    # "ExtensionDtype"
+    def construct_array_type(  # type: ignore[override]
+        self,
+    ) -> Type[StringArray | ArrowStringArray]:
         """
         Return the array type associated with this dtype.
 
         Returns
         -------
         type
         """
-        return StringArray
+        from pandas.core.arrays.string_arrow import ArrowStringArray
 
-    def __repr__(self) -> str:
-        return "StringDtype"
+        if self.storage == "python":
+            return StringArray
+        else:
+            return ArrowStringArray
+
+    def __repr__(self):
+        return self.name
 
     def __from_arrow__(
         self, array: Union[pyarrow.Array, pyarrow.ChunkedArray]
-    ) -> StringArray:
+    ) -> ArrowStringArray:
         """
         Construct StringArray from pyarrow Array/ChunkedArray.
         """
         import pyarrow
 
+        from pandas.core.arrays.string_arrow import ArrowStringArray
+
         if isinstance(array, pyarrow.Array):
             chunks = [array]
         else:
@@ -122,7 +204,7 @@ def __from_arrow__(
             str_arr = StringArray._from_sequence(np.array(arr))
             results.append(str_arr)
 
-        return StringArray._concat_same_type(results)
+        return ArrowStringArray._concat_same_type(results)
 
 
 class StringArray(PandasArray):

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -6,24 +6,20 @@
     Any,
     Optional,
     Sequence,
-    Type,
     Union,
 )
 
 import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
 
-from pandas._libs import (
-    lib,
-    missing as libmissing,
-)
+from pandas._libs import lib
 from pandas._typing import (
     Dtype,
     NpDtype,
 )
 from pandas.util._validators import validate_fillna_kwargs
 
-from pandas.core.dtypes.base import ExtensionDtype
-from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna
 
 from pandas.api.types import (
@@ -35,121 +31,27 @@
 )
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexers import (
     check_array_indexer,
     validate_indices,
 )
 from pandas.core.missing import get_fill_func
 
-try:
-    import pyarrow as pa
-except ImportError:
-    pa = None
-else:
-    # PyArrow backed StringArrays are available starting at 1.0.0, but this
-    # file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute
-    # and its compute functions existed. GH38801
-    if LooseVersion(pa.__version__) >= "1.0.0":
-        import pyarrow.compute as pc
-
-        ARROW_CMP_FUNCS = {
-            "eq": pc.equal,
-            "ne": pc.not_equal,
-            "lt": pc.less,
-            "gt": pc.greater,
-            "le": pc.less_equal,
-            "ge": pc.greater_equal,
-        }
+ARROW_CMP_FUNCS = {
+    "eq": pc.equal,
+    "ne": pc.not_equal,
+    "lt": pc.less,
+    "gt": pc.greater,
+    "le": pc.less_equal,
+    "ge": pc.greater_equal,
+}
 
 
 if TYPE_CHECKING:
     from pandas import Series
 
 
-@register_extension_dtype
-class ArrowStringDtype(ExtensionDtype):
-    """
-    Extension dtype for string data in a ``pyarrow.ChunkedArray``.
-
-    .. versionadded:: 1.2.0
-
-    .. warning::
-
-       ArrowStringDtype is considered experimental. The implementation and
-       parts of the API may change without warning.
-
-    Attributes
-    ----------
-    None
-
-    Methods
-    -------
-    None
-
-    Examples
-    --------
-    >>> from pandas.core.arrays.string_arrow import ArrowStringDtype
-    >>> ArrowStringDtype()
-    ArrowStringDtype
-    """
-
-    name = "arrow_string"
-
-    #: StringDtype.na_value uses pandas.NA
-    na_value = libmissing.NA
-
-    @property
-    def type(self) -> Type[str]:
-        return str
-
-    @classmethod
-    def construct_array_type(cls) -> Type[ArrowStringArray]:
-        """
-        Return the array type associated with this dtype.
-
-        Returns
-        -------
-        type
-        """
-        return ArrowStringArray
-
-    def __hash__(self) -> int:
-        return hash("ArrowStringDtype")
-
-    def __repr__(self) -> str:
-        return "ArrowStringDtype"
-
-    def __from_arrow__(
-        self, array: Union[pa.Array, pa.ChunkedArray]
-    ) -> ArrowStringArray:
-        """
-        Construct StringArray from pyarrow Array/ChunkedArray.
-        """
-        return ArrowStringArray(array)
-
-    def __eq__(self, other) -> bool:
-        """Check whether 'other' is equal to self.
-
-        By default, 'other' is considered equal if
-        * it's a string matching 'self.name'.
-        * it's an instance of this type.
-
-        Parameters
-        ----------
-        other : Any
-
-        Returns
-        -------
-        bool
-        """
-        if isinstance(other, ArrowStringDtype):
-            return True
-        elif isinstance(other, str) and other == "arrow_string":
-            return True
-        else:
-            return False
-
-
 class ArrowStringArray(OpsMixin, ExtensionArray):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
@@ -188,13 +90,13 @@ class ArrowStringArray(OpsMixin, ExtensionArray):
 
     Examples
     --------
-    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string")
+    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
     <ArrowStringArray>
     ['This is', 'some text', <NA>, 'data.']
     Length: 4, dtype: arrow_string
     """
 
-    _dtype = ArrowStringDtype()
+    _dtype = StringDtype(storage="pyarrow")
 
     def __init__(self, values):
         self._chk_pyarrow_available()
@@ -232,9 +134,9 @@ def _from_sequence_of_strings(
         return cls._from_sequence(strings, dtype=dtype, copy=copy)
 
     @property
-    def dtype(self) -> ArrowStringDtype:
+    def dtype(self) -> StringDtype:
         """
-        An instance of 'ArrowStringDtype'.
+        An instance of 'StringDtype[pyarrow]'.
         """
         return self._dtype