From 4cb60e6b586338c468e04a4274a05c06811adeb7 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn" <uwe.korn@quantco.com>
Date: Fri, 10 Jul 2020 20:19:15 +0200
Subject: [PATCH 01/46] Implement BaseDtypeTests for ArrowStringDtype

---
 pandas/core/arrays/base.py                  |   6 +-
 pandas/core/arrays/string_arrow.py          | 484 ++++++++++++++++++++
 pandas/tests/extension/test_string_arrow.py | 125 +++++
 setup.py                                    |   2 +-
 4 files changed, 615 insertions(+), 2 deletions(-)
 create mode 100644 pandas/core/arrays/string_arrow.py
 create mode 100644 pandas/tests/extension/test_string_arrow.py

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 8193d65b3b30c..736d95b4b64b6 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -457,9 +457,13 @@ def astype(self, dtype, copy=True):
             NumPy ndarray with 'dtype' for its dtype.
         """
         from pandas.core.arrays.string_ import StringDtype
+        from pandas.core.arrays.string_arrow import ArrowStringDtype
 
         dtype = pandas_dtype(dtype)
-        if isinstance(dtype, StringDtype):  # allow conversion to StringArrays
+        # FIXME: Really hard-code here?
+        if isinstance(
+            dtype, (ArrowStringDtype, StringDtype)
+        ):  # allow conversion to StringArrays
             return dtype.construct_array_type()._from_sequence(self, copy=False)
 
         return np.array(self, dtype=dtype, copy=copy)
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
new file mode 100644
index 0000000000000..8248a3e91c0fe
--- /dev/null
+++ b/pandas/core/arrays/string_arrow.py
@@ -0,0 +1,484 @@
+from collections.abc import Iterable
+from typing import Any, Optional, Sequence, Tuple, Type, Union
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+
+from pandas._libs import missing as libmissing
+from pandas._typing import ArrayLike
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.dtypes import register_extension_dtype
+
+import pandas as pd
+from pandas.api.types import (
+    is_array_like,
+    is_bool_dtype,
+    is_integer,
+    is_integer_dtype,
+    is_scalar,
+)
+from pandas.core.arrays.base import ExtensionArray
+from pandas.core.indexers import check_array_indexer
+
+
+def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]:
+    scalar = arrow_scalar.as_py()
+    if scalar is None:
+        return libmissing.NA
+    else:
+        return scalar
+
+
+@register_extension_dtype
+class ArrowStringDtype(ExtensionDtype):
+    """
+    Extension dtype for string data in a ``pyarrow.ChunkedArray``.
+
+    .. versionadded:: 1.1.0
+
+    .. warning::
+
+       ArrowStringDtype is considered experimental. The implementation and
+       parts of the API may change without warning.
+
+    Attributes
+    ----------
+    None
+
+    Methods
+    -------
+    None
+
+    Examples
+    --------
+    >>> pd.ArrowStringDtype()
+    ArrowStringDtype
+    """
+
+    name = "arrow_string"
+
+    #: StringDtype.na_value uses pandas.NA
+    na_value = libmissing.NA
+
+    @property
+    def type(self) -> Type[str]:
+        return str
+
+    @classmethod
+    def construct_array_type(cls) -> Type["ArrowStringArray"]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ArrowStringArray
+
+    def __hash__(self) -> int:
+        return hash("ArrowStringDtype")
+
+    def __repr__(self) -> str:
+        return "ArrowStringDtype"
+
+    def __from_arrow__(
+        self, array: Union["pa.Array", "pa.ChunkedArray"]
+    ) -> "ArrowStringArray":
+        """
+        Construct StringArray from pyarrow Array/ChunkedArray.
+        """
+        return ArrowStringArray(array)
+
+    def __eq__(self, other) -> bool:
+        """Check whether 'other' is equal to self.
+
+        By default, 'other' is considered equal if
+        * it's a string matching 'self.name'.
+        * it's an instance of this type.
+
+        Parameters
+        ----------
+        other : Any
+
+        Returns
+        -------
+        bool
+        """
+        if isinstance(other, ArrowStringDtype):
+            return True
+        elif isinstance(other, str) and other == "arrow_string":
+            return True
+        else:
+            return False
+
+
+class ArrowStringArray(ExtensionArray):
+    """
+    Extension array for string data in a ``pyarrow.ChunkedArray``.
+
+    .. versionadded:: 1.1.0
+
+    .. warning::
+
+       ArrowStringArray is considered experimental. The implementation and
+       parts of the API may change without warning.
+
+    Parameters
+    ----------
+    values : pyarrow.Array or pyarrow.ChunkedArray
+        The array of data.
+
+    Attributes
+    ----------
+    None
+
+    Methods
+    -------
+    None
+
+    See Also
+    --------
+    array
+        The recommended function for creating an ArrowStringArray.
+    Series.str
+        The string methods are available on Series backed by
+        an ArrowStringArray.
+
+    Notes
+    -----
+    ArrowStringArray returns a BooleanArray for comparison methods.
+
+    Examples
+    --------
+    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string")
+    <ArrowStringArray>
+    ['This is', 'some text', <NA>, 'data.']
+    Length: 4, dtype: arrow_string
+    """
+
+    def __init__(self, values):
+        if isinstance(values, pa.Array):
+            self.data = pa.chunked_array([values])
+        elif isinstance(values, pa.ChunkedArray):
+            self.data = values
+        else:
+            raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")
+
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        # TODO(ARROW-9407): Accept pd.NA in Arrow
+        scalars_corrected = [None if pd.isna(x) else x for x in scalars]
+        return cls(pa.array(scalars_corrected, type=pa.string()))
+
+    @property
+    def dtype(self) -> ArrowStringDtype:
+        """
+        An instance of 'ArrowStringDtype'.
+        """
+        return ArrowStringDtype()
+
+    def __array__(self, *args, **kwargs) -> "np.ndarray":
+        """Correctly construct numpy arrays when passed to `np.asarray()`."""
+        return self.data.__array__(*args, **kwargs)
+
+    def __arrow_array__(self, type=None):
+        """Convert myself to a pyarrow Array or ChunkedArray."""
+        return self.data
+
+    @property
+    def size(self) -> int:
+        """
+        Return the number of elements in this array.
+
+        Returns
+        -------
+        size : int
+        """
+        return len(self.data)
+
+    @property
+    def shape(self) -> Tuple[int]:
+        """Return the shape of the data."""
+        # This may be patched by pandas to support pseudo-2D operations.
+        return (len(self.data),)
+
+    @property
+    def ndim(self) -> int:
+        """Return the number of dimensions of the underlying data."""
+        return 1
+
+    def __len__(self) -> int:
+        """
+        Length of this array.
+
+        Returns
+        -------
+        length : int
+        """
+        return len(self.data)
+
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        return cls._from_sequence(strings, dtype=dtype, copy=copy)
+
+    def __getitem__(self, item):
+        # type (Any) -> Any
+        """Select a subset of self.
+
+        Parameters
+        ----------
+        item : int, slice, or ndarray
+            * int: The position in 'self' to get.
+            * slice: A slice object, where 'start', 'stop', and 'step' are
+              integers or None
+            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
+
+        Returns
+        -------
+        item : scalar or ExtensionArray
+
+        Notes
+        -----
+        For scalar ``item``, return a scalar value suitable for the array's
+        type. This should be an instance of ``self.dtype.type``.
+        For slice ``item``, return an instance of ``ExtensionArray``, even
+        if the slice is length 0 or 1.
+        For a boolean mask, return an instance of ``ExtensionArray``, filtered
+        to the values where ``item`` is True.
+        """
+        item = check_array_indexer(self, item)
+
+        if isinstance(item, Iterable):
+            if not is_array_like(item):
+                item = np.array(item)
+            if len(item) == 0:
+                return type(self)(pa.chunked_array([], type=pa.string()))
+            elif is_integer_dtype(item):
+                return self.take(item)
+            elif is_bool_dtype(item):
+                return type(self)(self.data.filter(item))
+            else:
+                raise IndexError(
+                    "Only integers, slices and integer or "
+                    "boolean arrays are valid indices."
+                )
+        elif is_integer(item):
+            if item < 0:
+                item += len(self)
+            if item >= len(self):
+                raise IndexError("index out of bounds")
+
+        value = self.data[item]
+        if isinstance(value, pa.ChunkedArray):
+            return type(self)(value)
+        else:
+            return _as_pandas_scalar(value)
+
+    def fillna(self, value=None, method=None, limit=None):
+        raise NotImplementedError("fillna")
+
+    def _reduce(self, name, skipna=True, **kwargs):
+        if name in ["min", "max"]:
+            return getattr(self, name)(skipna=skipna)
+
+        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
+
+    @property
+    def nbytes(self) -> int:
+        """
+        The number of bytes needed to store this object in memory.
+        """
+        return self.data.nbytes
+
+    def isna(self) -> np.ndarray:
+        """
+        Boolean NumPy array indicating if each value is missing.
+
+        This should return a 1-D array the same length as 'self'.
+        """
+        # TODO: Implement .to_numpy for ChunkedArray
+        return self.data.is_null().to_pandas().values
+
+    def copy(self) -> ExtensionArray:
+        """
+        Return a copy of the array.
+
+        Parameters
+        ----------
+        deep : bool, default False
+            Also copy the underlying data backing this array.
+
+        Returns
+        -------
+        ExtensionArray
+        """
+        return type(self)(self.data)
+
+    def __eq__(self, other: Any) -> ArrayLike:
+        """
+        Return for `self == other` (element-wise equality).
+        """
+        if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)):
+            return NotImplemented
+        if isinstance(other, ArrowStringArray):
+            result = pc.equal(self.data, other.data)
+        elif is_scalar(other):
+            result = pc.equal(self.data, pa.scalar(other))
+        else:
+            raise NotImplementedError("Neither scalar nor ArrowStringArray")
+
+        # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray
+        return pd.array(result.to_pandas().values)
+
+    def __setitem__(self, key, value):
+        # type: (Union[int, np.ndarray], Any) -> None
+        """Set one or more values inplace.
+
+        Parameters
+        ----------
+        key : int, ndarray, or slice
+            When called from, e.g. ``Series.__setitem__``, ``key`` will be
+            one of
+
+            * scalar int
+            * ndarray of integers.
+            * boolean ndarray
+            * slice object
+
+        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
+            value or values to be set of ``key``.
+
+        Returns
+        -------
+        None
+        """
+        key = check_array_indexer(self, key)
+
+        if is_integer(key):
+            if not pd.api.types.is_scalar(value):
+                raise ValueError("Must pass scalars with scalar indexer")
+            elif pd.isna(value):
+                value = None
+            elif not isinstance(value, str):
+                raise ValueError("Scalar must be NA or str")
+
+            # Slice data and insert the new value in between
+            new_data = [
+                *self.data[0:key].chunks,
+                pa.array([value], type=pa.string()),
+                *self.data[(key + 1) :].chunks,
+            ]
+            self.data = pa.chunked_array(new_data)
+        else:
+            # Convert to integer indices and iteratively assign.
+            # TODO: Make a faster variant of this in Arrow upstream.
+            #       This is probably extremely slow.
+
+            # Convert all possible input key types to an array of integers
+            if is_bool_dtype(key):
+                # TODO(ARROW-9430): Directly support setitem(booleans)
+                key_array = np.argwhere(key).flatten()
+            elif isinstance(key, slice):
+                key_array = np.array(range(len(self))[key])
+            else:
+                # TODO(ARROW-9431): Directly support setitem(integers)
+                key_array = np.asanyarray(key)
+
+            if pd.api.types.is_scalar(value):
+                value = np.broadcast_to(value, len(key_array))
+            else:
+                value = np.asarray(value)
+
+            if len(key_array) != len(value):
+                raise ValueError("Length of indexer and values mismatch")
+
+            for k, v in zip(key_array, value):
+                self[k] = v
+
+    def take(
+        self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None
+    ) -> "ExtensionArray":
+        """
+        Take elements from an array.
+
+        Parameters
+        ----------
+        indices : sequence of int
+            Indices to be taken.
+        allow_fill : bool, default False
+            How to handle negative values in `indices`.
+
+            * False: negative values in `indices` indicate positional indices
+              from the right (the default). This is similar to
+              :func:`numpy.take`.
+
+            * True: negative values in `indices` indicate
+              missing values. These values are set to `fill_value`. Any
+              other negative values raise a ``ValueError``.
+
+        fill_value : any, optional
+            Fill value to use for NA-indices when `allow_fill` is True.
+            This may be ``None``, in which case the default NA value for
+            the type, ``self.dtype.na_value``, is used.
+
+            For many ExtensionArrays, there will be two representations of
+            `fill_value`: a user-facing "boxed" scalar, and a low-level
+            physical NA value. `fill_value` should be the user-facing version,
+            and the implementation should handle translating that to the
+            physical version for processing the take if necessary.
+
+        Returns
+        -------
+        ExtensionArray
+
+        Raises
+        ------
+        IndexError
+            When the indices are out of bounds for the array.
+        ValueError
+            When `indices` contains negative values other than ``-1``
+            and `allow_fill` is True.
+
+        See Also
+        --------
+        numpy.take
+        api.extensions.take
+
+        Notes
+        -----
+        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
+        ``iloc``, when `indices` is a sequence of values. Additionally,
+        it's called by :meth:`Series.reindex`, or any other method
+        that causes realignment, with a `fill_value`.
+        """
+        # TODO: Remove once we got rid of the (indices < 0) check
+        if not is_array_like(indices):
+            indices_array = np.asanyarray(indices)
+        else:
+            indices_array = indices
+
+        if len(self.data) == 0 and (indices_array >= 0).any():
+            raise IndexError("cannot do a non-empty take")
+        if indices_array.max() >= len(self.data):
+            raise IndexError("out of bounds value in 'indices'.")
+
+        if allow_fill:
+            if (indices_array < 0).any():
+                # TODO(ARROW-9433): Treat negative indices as NULL
+                indices_array = pa.array(indices_array, mask=indices_array < 0)
+                result = self.data.take(indices_array)
+                if pd.isna(fill_value):
+                    return type(self)(result)
+                return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
+            else:
+                # Nothing to fill
+                return type(self)(self.data.take(indices))
+        else:  # allow_fill=False
+            # TODO(ARROW-9432): Treat negative indices as indices from the right.
+            if (indices_array < 0).any():
+                # Don't modify in-place
+                indices_array = np.copy(indices_array)
+                indices_array[indices_array < 0] += len(self.data)
+            return type(self)(self.data.take(indices_array))
diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py
new file mode 100644
index 0000000000000..437d51060fb7f
--- /dev/null
+++ b/pandas/tests/extension/test_string_arrow.py
@@ -0,0 +1,125 @@
+import string
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype
+from pandas.tests.extension import base
+
+
+@pytest.fixture
+def dtype():
+    return ArrowStringDtype()
+
+
+@pytest.fixture
+def data():
+    strings = np.random.choice(list(string.ascii_letters), size=100)
+    while strings[0] == strings[1]:
+        strings = np.random.choice(list(string.ascii_letters), size=100)
+
+    return ArrowStringArray._from_sequence(strings)
+
+
+@pytest.fixture
+def data_missing():
+    """Length 2 array with [NA, Valid]"""
+    return ArrowStringArray._from_sequence([pd.NA, "A"])
+
+
+@pytest.fixture
+def data_for_sorting():
+    return ArrowStringArray._from_sequence(["B", "C", "A"])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return ArrowStringArray._from_sequence(["B", pd.NA, "A"])
+
+
+@pytest.fixture
+def na_value():
+    return pd.NA
+
+
+@pytest.fixture
+def data_for_grouping():
+    return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"])
+
+
+class TestDtype(base.BaseDtypeTests):
+    pass
+
+
+class TestInterface(base.BaseInterfaceTests):
+    @pytest.mark.xfail(reason="Fails until implement, remove before merge")
+    def test_view(self, data):
+        base.BaseInterfaceTests.test_view(self, data)
+
+
+class TestConstructors(base.BaseConstructorsTests):
+    pass
+
+
+#  class TestReshaping(base.BaseReshapingTests):
+#     pass
+
+
+class TestGetitem(base.BaseGetitemTests):
+    pass
+
+
+class TestSetitem(base.BaseSetitemTests):
+    pass
+
+
+# class TestMissing(base.BaseMissingTests):
+#     pass
+
+
+# class TestNoReduce(base.BaseNoReduceTests):
+#     @pytest.mark.parametrize("skipna", [True, False])
+#     def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
+#         op_name = all_numeric_reductions
+#
+#         if op_name in ["min", "max"]:
+#             return None
+#
+#         s = pd.Series(data)
+#         with pytest.raises(TypeError):
+#             getattr(s, op_name)(skipna=skipna)
+
+
+# class TestMethods(base.BaseMethodsTests):
+#     @pytest.mark.skip(reason="returns nullable")
+#     def test_value_counts(self, all_data, dropna):
+#         return super().test_value_counts(all_data, dropna)
+
+
+# class TestCasting(base.BaseCastingTests):
+#     pass
+
+
+# class TestComparisonOps(base.BaseComparisonOpsTests):
+#     def _compare_other(self, s, data, op_name, other):
+#         result = getattr(s, op_name)(other)
+#         expected = getattr(s.astype(object), op_name)(other).astype("boolean")
+#         self.assert_series_equal(result, expected)
+
+#     def test_compare_scalar(self, data, all_compare_operators):
+#         op_name = all_compare_operators
+#         s = pd.Series(data)
+#         self._compare_other(s, data, op_name, "abc")
+
+
+# class TestParsing(base.BaseParsingTests):
+#     pass
+
+
+# class TestPrinting(base.BasePrintingTests):
+#     pass
+
+
+# class TestGroupBy(base.BaseGroupbyTests):
+#     pass
diff --git a/setup.py b/setup.py
index f6f0cd9aabc0e..4033ea2935de5 100755
--- a/setup.py
+++ b/setup.py
@@ -432,7 +432,7 @@ def run(self):
         extra_compile_args.append("/Z7")
         extra_link_args.append("/DEBUG")
 else:
-    extra_compile_args = ["-Werror"]
+    extra_compile_args = []
     extra_link_args = []
     if debugging_symbols_requested:
         extra_compile_args.append("-g")

From d242f2d0bc2d0eae9481ce2fa09969d9eb20113c Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 3 Sep 2020 15:32:45 -0500
Subject: [PATCH 02/46] Refactor to use parametrized StringDtype

---
 pandas/core/arrays/base.py                    |  13 +-
 pandas/core/arrays/string_.py                 |  90 +++++++++-
 pandas/core/arrays/string_arrow.py            | 166 +++++++-----------
 pandas/core/config_init.py                    |  13 ++
 pandas/core/strings.py                        |  10 +-
 .../tests/arrays/string_/test_string_arrow.py |  26 +++
 pandas/tests/extension/arrow/test_string.py   |   7 +-
 pandas/tests/extension/test_string_arrow.py   | 103 +++++++----
 setup.py                                      |   2 +-
 9 files changed, 261 insertions(+), 169 deletions(-)
 create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 736d95b4b64b6..9b1b2c0d74e3f 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -356,6 +356,8 @@ def __ne__(self, other: Any) -> ArrayLike:
         """
         Return for `self != other` (element-wise in-equality).
         """
+        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+            return NotImplemented
         return ~(self == other)
 
     def to_numpy(
@@ -457,13 +459,10 @@ def astype(self, dtype, copy=True):
             NumPy ndarray with 'dtype' for its dtype.
         """
         from pandas.core.arrays.string_ import StringDtype
-        from pandas.core.arrays.string_arrow import ArrowStringDtype
 
         dtype = pandas_dtype(dtype)
         # FIXME: Really hard-code here?
-        if isinstance(
-            dtype, (ArrowStringDtype, StringDtype)
-        ):  # allow conversion to StringArrays
+        if isinstance(dtype, StringDtype):  # allow conversion to StringArrays
             return dtype.construct_array_type()._from_sequence(self, copy=False)
 
         return np.array(self, dtype=dtype, copy=copy)
@@ -928,9 +927,9 @@ def take(
               from the right (the default). This is similar to
               :func:`numpy.take`.
 
-            * True: negative values in `indices` indicate
-              missing values. These values are set to `fill_value`. Any other
-              other negative values raise a ``ValueError``.
+            * True: ``-1`` in `indices` indicates missing values.
+              These values are set to `fill_value`. Any other negative
+              value raises a ``ValueError``.
 
         fill_value : any, optional
             Fill value to use for NA-indices when `allow_fill` is True.
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 381968f9724b6..0e7c5a8036bcf 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -1,8 +1,10 @@
 import operator
-from typing import TYPE_CHECKING, Type, Union
+from typing import TYPE_CHECKING, Any, Type, Union
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib, missing as libmissing
 
 from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
@@ -50,17 +52,83 @@ class StringDtype(ExtensionDtype):
     StringDtype
     """
 
-    name = "string"
-
     #: StringDtype.na_value uses pandas.NA
     na_value = libmissing.NA
+    _metadata = ("storage",)
+
+    def __init__(self, storage=None):
+        if storage is None:
+            storage = get_option("mode.string_storage")
+        if storage not in {"python", "pyarrow"}:
+            raise ValueError(
+                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
+            )
+        self.storage = storage
+
+    @property
+    def name(self):
+        return f"StringDtype[{self.storage}]"
 
     @property
     def type(self) -> Type[str]:
         return str
 
     @classmethod
-    def construct_array_type(cls) -> Type["StringArray"]:
+    def construct_from_string(cls, string):
+        """
+        Construct a StringDtype from a string.
+
+        Parameters
+        ----------
+        string : str
+            The name of the type. The storage type will be taken from `string`.
+            Valid options and their storage types are
+
+            ========================== ==============
+            string                     result storage
+            ========================== ==============
+            ``'string'``               global default
+            ``'string[python]'``       python
+            ``'StringDtype[python]'``  python
+            ``'string[pyarrow]'``      pyarrow
+            ``'StringDtype[pyarrow]'`` pyarrow
+            ========================== ==============
+
+        Returns
+        -------
+        StringDtype
+
+        Raises
+        ------
+        TypeError
+            If the string is not a valid option.
+
+        """
+        if not isinstance(string, str):
+            raise TypeError(
+                f"'construct_from_string' expects a string, got {type(string)}"
+            )
+        if string == "string":
+            # TODO: use global default
+            return cls()
+        elif string in {"string[python]", "StringDtype[python]"}:
+            return cls(storage="python")
+        elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}:
+            return cls(storage="pyarrow")
+        else:
+            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, str) and other == "string":
+            return True
+        return super().__eq__(other)
+
+    def __hash__(self) -> int:
+        # custom __eq__ so have to override __hash__
+        return super().__hash__()
+
+    # XXX: this is a classmethod, but we need to know the storage type.
+    def construct_array_type(self) -> Type["StringArray"]:
         """
         Return the array type associated with this dtype.
 
@@ -68,10 +136,15 @@ def construct_array_type(cls) -> Type["StringArray"]:
         -------
         type
         """
-        return StringArray
+        from .string_arrow import ArrowStringArray
+
+        if self.storage == "python":
+            return StringArray
+        else:
+            return ArrowStringArray
 
-    def __repr__(self) -> str:
-        return "StringDtype"
+    def __repr__(self):
+        return self.name
 
     def __from_arrow__(
         self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
@@ -80,6 +153,7 @@ def __from_arrow__(
         Construct StringArray from pyarrow Array/ChunkedArray.
         """
         import pyarrow  # noqa: F811
+        from .string_arrow import ArrowStringArray
 
         if isinstance(array, pyarrow.Array):
             chunks = [array]
@@ -93,7 +167,7 @@ def __from_arrow__(
             str_arr = StringArray._from_sequence(np.array(arr))
             results.append(str_arr)
 
-        return StringArray._concat_same_type(results)
+        return ArrowStringArray._concat_same_type(results)
 
 
 class StringArray(PandasArray):
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 8248a3e91c0fe..c0831a65b3644 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,5 +1,5 @@
 from collections.abc import Iterable
-from typing import Any, Optional, Sequence, Tuple, Type, Union
+from typing import Any, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pyarrow as pa
@@ -8,18 +8,19 @@
 from pandas._libs import missing as libmissing
 from pandas._typing import ArrayLike
 
-from pandas.core.dtypes.base import ExtensionDtype
-from pandas.core.dtypes.dtypes import register_extension_dtype
+from pandas.core.dtypes.missing import isna
 
-import pandas as pd
 from pandas.api.types import (
     is_array_like,
     is_bool_dtype,
+    is_int64_dtype,
     is_integer,
     is_integer_dtype,
     is_scalar,
 )
+from pandas.core.algorithms import factorize
 from pandas.core.arrays.base import ExtensionArray
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexers import check_array_indexer
 
 
@@ -31,89 +32,6 @@ def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]:
         return scalar
 
 
-@register_extension_dtype
-class ArrowStringDtype(ExtensionDtype):
-    """
-    Extension dtype for string data in a ``pyarrow.ChunkedArray``.
-
-    .. versionadded:: 1.1.0
-
-    .. warning::
-
-       ArrowStringDtype is considered experimental. The implementation and
-       parts of the API may change without warning.
-
-    Attributes
-    ----------
-    None
-
-    Methods
-    -------
-    None
-
-    Examples
-    --------
-    >>> pd.ArrowStringDtype()
-    ArrowStringDtype
-    """
-
-    name = "arrow_string"
-
-    #: StringDtype.na_value uses pandas.NA
-    na_value = libmissing.NA
-
-    @property
-    def type(self) -> Type[str]:
-        return str
-
-    @classmethod
-    def construct_array_type(cls) -> Type["ArrowStringArray"]:
-        """
-        Return the array type associated with this dtype.
-
-        Returns
-        -------
-        type
-        """
-        return ArrowStringArray
-
-    def __hash__(self) -> int:
-        return hash("ArrowStringDtype")
-
-    def __repr__(self) -> str:
-        return "ArrowStringDtype"
-
-    def __from_arrow__(
-        self, array: Union["pa.Array", "pa.ChunkedArray"]
-    ) -> "ArrowStringArray":
-        """
-        Construct StringArray from pyarrow Array/ChunkedArray.
-        """
-        return ArrowStringArray(array)
-
-    def __eq__(self, other) -> bool:
-        """Check whether 'other' is equal to self.
-
-        By default, 'other' is considered equal if
-        * it's a string matching 'self.name'.
-        * it's an instance of this type.
-
-        Parameters
-        ----------
-        other : Any
-
-        Returns
-        -------
-        bool
-        """
-        if isinstance(other, ArrowStringDtype):
-            return True
-        elif isinstance(other, str) and other == "arrow_string":
-            return True
-        else:
-            return False
-
-
 class ArrowStringArray(ExtensionArray):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
@@ -165,19 +83,20 @@ def __init__(self, values):
             self.data = values
         else:
             raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")
+        self._dtype = StringDtype(storage="pyarrow")
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         # TODO(ARROW-9407): Accept pd.NA in Arrow
-        scalars_corrected = [None if pd.isna(x) else x for x in scalars]
+        scalars_corrected = [None if isna(x) else x for x in scalars]
         return cls(pa.array(scalars_corrected, type=pa.string()))
 
     @property
-    def dtype(self) -> ArrowStringDtype:
+    def dtype(self) -> StringDtype:
         """
-        An instance of 'ArrowStringDtype'.
+        An instance of 'StringDtype'.
         """
-        return ArrowStringDtype()
+        return self._dtype
 
     def __array__(self, *args, **kwargs) -> "np.ndarray":
         """Correctly construct numpy arrays when passed to `np.asarray()`."""
@@ -276,15 +195,6 @@ def __getitem__(self, item):
         else:
             return _as_pandas_scalar(value)
 
-    def fillna(self, value=None, method=None, limit=None):
-        raise NotImplementedError("fillna")
-
-    def _reduce(self, name, skipna=True, **kwargs):
-        if name in ["min", "max"]:
-            return getattr(self, name)(skipna=skipna)
-
-        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
-
     @property
     def nbytes(self) -> int:
         """
@@ -320,7 +230,9 @@ def __eq__(self, other: Any) -> ArrayLike:
         """
         Return for `self == other` (element-wise equality).
         """
-        if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)):
+        from pandas import array, Series, DataFrame, Index
+
+        if isinstance(other, (Series, DataFrame, Index)):
             return NotImplemented
         if isinstance(other, ArrowStringArray):
             result = pc.equal(self.data, other.data)
@@ -330,7 +242,7 @@ def __eq__(self, other: Any) -> ArrayLike:
             raise NotImplementedError("Neither scalar nor ArrowStringArray")
 
         # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray
-        return pd.array(result.to_pandas().values)
+        return array(result.to_pandas().values, dtype="boolean")
 
     def __setitem__(self, key, value):
         # type: (Union[int, np.ndarray], Any) -> None
@@ -357,9 +269,9 @@ def __setitem__(self, key, value):
         key = check_array_indexer(self, key)
 
         if is_integer(key):
-            if not pd.api.types.is_scalar(value):
+            if not is_scalar(value):
                 raise ValueError("Must pass scalars with scalar indexer")
-            elif pd.isna(value):
+            elif isna(value):
                 value = None
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
@@ -386,7 +298,7 @@ def __setitem__(self, key, value):
                 # TODO(ARROW-9431): Directly support setitem(integers)
                 key_array = np.asanyarray(key)
 
-            if pd.api.types.is_scalar(value):
+            if is_scalar(value):
                 value = np.broadcast_to(value, len(key_array))
             else:
                 value = np.asarray(value)
@@ -461,15 +373,20 @@ def take(
 
         if len(self.data) == 0 and (indices_array >= 0).any():
             raise IndexError("cannot do a non-empty take")
-        if indices_array.max() >= len(self.data):
+        if len(indices_array) > 0 and indices_array.max() >= len(self.data):
             raise IndexError("out of bounds value in 'indices'.")
 
         if allow_fill:
             if (indices_array < 0).any():
+                if indices_array.min() < -1:
+                    raise ValueError(
+                        "'indices' contains negative values other than "
+                        "-1 with 'allow_fill=True'."
+                    )
                 # TODO(ARROW-9433): Treat negative indices as NULL
                 indices_array = pa.array(indices_array, mask=indices_array < 0)
                 result = self.data.take(indices_array)
-                if pd.isna(fill_value):
+                if isna(fill_value):
                     return type(self)(result)
                 return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
             else:
@@ -482,3 +399,38 @@ def take(
                 indices_array = np.copy(indices_array)
                 indices_array[indices_array < 0] += len(self.data)
             return type(self)(self.data.take(indices_array))
+
+    def value_counts(self, dropna=True):
+        from pandas import Series
+
+        if dropna:
+            na = self.isna()
+            self = self[~na]
+        counts = self.data.value_counts()
+        return Series(counts.field(1), counts.field(0))
+
+    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]:
+        # see https://github.com/xhochy/fletcher/blob/master/fletcher/base.py
+        # doesn't handle dictionary types.
+        if self.data.num_chunks == 1:
+            encoded = self.data.chunk(0).dictionary_encode()
+            indices = encoded.indices.to_pandas()
+            if indices.dtype.kind == "f":
+                indices[np.isnan(indices)] = na_sentinel
+                indices = indices.astype(int)
+            if not is_int64_dtype(indices):
+                indices = indices.astype(np.int64)
+            return indices.values, type(self)(encoded.dictionary)
+        else:
+            np_array = self.data.to_pandas().values
+            return factorize(np_array, na_sentinel=na_sentinel)
+
+    @classmethod
+    def _concat_same_type(
+        cls, to_concat: Sequence["ArrowStringArray"]
+    ) -> "ArrowStringArray":
+        return cls(
+            pa.chunked_array(
+                [array for ea in to_concat for array in ea.data.iterchunks()]
+            )
+        )
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 0c23f1b4bcdf2..a58e6eccf7644 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -504,6 +504,19 @@ def use_inf_as_na_cb(key):
     )
 
 
+string_storage_doc = """
+: string
+    The default storage for StringDtype.
+"""
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "string_storage",
+        "python",
+        string_storage_doc,
+        validator=is_one_of_factory(["python", "pyarrow"]),
+    )
+
 # Set up the io.excel specific reader configuration.
 reader_engine_doc = """
 : string
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 6702bf519c52e..59aa8fc5cfa0e 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -901,8 +901,10 @@ def _result_dtype(arr):
     # workaround #27953
     # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
     # when the list of values is empty.
-    if arr.dtype.name == "string":
-        return "string"
+    from pandas.core.arrays.string_ import StringDtype
+
+    if isinstance(arr.dtype, StringDtype):
+        return arr.dtype.name
     else:
         return object
 
@@ -2097,9 +2099,11 @@ class StringMethods(NoNewAttributesMixin):
     """
 
     def __init__(self, data):
+        from pandas.core.arrays.string_ import StringDtype
+
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data.dtype)
-        self._is_string = data.dtype.name == "string"
+        self._is_string = isinstance(data.dtype, StringDtype)
 
         # ._values.categories works for both Series/Index
         self._parent = data._values.categories if self._is_categorical else data
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
new file mode 100644
index 0000000000000..40e3f21670ea0
--- /dev/null
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -0,0 +1,26 @@
+import pytest
+
+import pandas as pd
+import pandas.testing as tm
+
+
+def test_eq_all_na():
+    a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow"))
+    result = a == a
+    expected = pd.array([pd.NA, pd.NA], dtype="boolean")
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_config():
+    # python by default
+    assert pd.StringDtype().storage == "python"
+    arr = pd.array(["a", "b"])
+    assert arr.dtype.storage == "python"
+
+    with pd.option_context("mode.string_storage", "pyarrow"):
+        assert pd.StringDtype().storage == "pyarrow"
+        arr = pd.array(["a", "b"])
+        assert arr.dtype.storage == "pyarrow"
+
+    with pytest.raises(ValueError):
+        pd.options.mode.string_storage = "foo"
diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py
index abd5c1f386dc5..f32f1e415ddc7 100644
--- a/pandas/tests/extension/arrow/test_string.py
+++ b/pandas/tests/extension/arrow/test_string.py
@@ -4,10 +4,9 @@
 
 pytest.importorskip("pyarrow", minversion="0.13.0")
 
-from .arrays import ArrowStringDtype  # isort:skip
-
 
 def test_constructor_from_list():
     # GH 27673
-    result = pd.Series(["E"], dtype=ArrowStringDtype())
-    assert isinstance(result.dtype, ArrowStringDtype)
+    result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow"))
+    assert isinstance(result.dtype, pd.StringDtype)
+    assert result.dtype.storage == "pyarrow"
diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py
index 437d51060fb7f..848e8a435b530 100644
--- a/pandas/tests/extension/test_string_arrow.py
+++ b/pandas/tests/extension/test_string_arrow.py
@@ -4,13 +4,13 @@
 import pytest
 
 import pandas as pd
-from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype
+from pandas.core.arrays.string_arrow import ArrowStringArray
 from pandas.tests.extension import base
 
 
 @pytest.fixture
 def dtype():
-    return ArrowStringDtype()
+    return pd.StringDtype(storage="pyarrow")
 
 
 @pytest.fixture
@@ -62,64 +62,89 @@ class TestConstructors(base.BaseConstructorsTests):
     pass
 
 
-#  class TestReshaping(base.BaseReshapingTests):
-#     pass
+class TestReshaping(base.BaseReshapingTests):
+    pass
 
 
 class TestGetitem(base.BaseGetitemTests):
-    pass
+    @pytest.mark.xfail(
+        reason="pyarrow.lib.ArrowNotImplementedError: Function "
+        "fill_null has no kernel matching input types "
+        "(array[string], scalar[string])"
+    )
+    def test_take_non_na_fill_value(self, data_missing):
+        super().test_take_non_na_fill_value(data_missing)
+
+    @pytest.mark.xfail(
+        reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no "
+        "kernel matching input types (array[string], scalar[string])"
+    )
+    def test_reindex_non_na_fill_value(self, data_missing):
+        super().test_reindex_non_na_fill_value(self, data_missing)
 
 
 class TestSetitem(base.BaseSetitemTests):
+    @pytest.mark.xfail(reason="TODO")
+    def test_setitem_preserves_views(self, data):
+        # Unclear where the issue is (pyarrow getitem, our getitem, our slice)
+        # and what to do here.
+        super().test_setitem_preserves_views(data)
+
+
+class TestMissing(base.BaseMissingTests):
     pass
 
 
-# class TestMissing(base.BaseMissingTests):
-#     pass
+class TestNoReduce(base.BaseNoReduceTests):
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
+        op_name = all_numeric_reductions
 
+        if op_name in ["min", "max"]:
+            return None
 
-# class TestNoReduce(base.BaseNoReduceTests):
-#     @pytest.mark.parametrize("skipna", [True, False])
-#     def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
-#         op_name = all_numeric_reductions
-#
-#         if op_name in ["min", "max"]:
-#             return None
-#
-#         s = pd.Series(data)
-#         with pytest.raises(TypeError):
-#             getattr(s, op_name)(skipna=skipna)
+        s = pd.Series(data)
+        with pytest.raises(TypeError):
+            getattr(s, op_name)(skipna=skipna)
 
 
-# class TestMethods(base.BaseMethodsTests):
-#     @pytest.mark.skip(reason="returns nullable")
-#     def test_value_counts(self, all_data, dropna):
-#         return super().test_value_counts(all_data, dropna)
+class TestMethods(base.BaseMethodsTests):
+    @pytest.mark.skip(reason="returns nullable")
+    def test_value_counts(self, all_data, dropna):
+        return super().test_value_counts(all_data, dropna)
 
 
-# class TestCasting(base.BaseCastingTests):
-#     pass
+class TestCasting(base.BaseCastingTests):
+    pass
 
 
-# class TestComparisonOps(base.BaseComparisonOpsTests):
-#     def _compare_other(self, s, data, op_name, other):
-#         result = getattr(s, op_name)(other)
-#         expected = getattr(s.astype(object), op_name)(other).astype("boolean")
-#         self.assert_series_equal(result, expected)
+class TestComparisonOps(base.BaseComparisonOpsTests):
+    def _compare_other(self, s, data, op_name, other):
+        if op_name not in {"__eq__", "__ne__"}:
+            pytest.skip(f"{op_name} is not implemented.")
+        result = getattr(s, op_name)(other)
+        expected = getattr(s.astype(object), op_name)(other).astype("boolean")
+        self.assert_series_equal(result, expected)
 
-#     def test_compare_scalar(self, data, all_compare_operators):
-#         op_name = all_compare_operators
-#         s = pd.Series(data)
-#         self._compare_other(s, data, op_name, "abc")
+    def test_compare_scalar(self, data, all_compare_operators):
+        op_name = all_compare_operators
+        s = pd.Series(data)
+        self._compare_other(s, data, op_name, "abc")
 
+    def test_compare_array(self, data, all_compare_operators):
+        op_name = all_compare_operators
+        s = pd.Series(data)
+        other = pd.Series([data[0]] * len(data), dtype=data.dtype)
+        self._compare_other(s, data, op_name, other)
 
-# class TestParsing(base.BaseParsingTests):
-#     pass
+
+class TestParsing(base.BaseParsingTests):
+    pass
 
 
-# class TestPrinting(base.BasePrintingTests):
-#     pass
+class TestPrinting(base.BasePrintingTests):
+    pass
 
 
-# class TestGroupBy(base.BaseGroupbyTests):
-#     pass
+class TestGroupBy(base.BaseGroupbyTests):
+    pass
diff --git a/setup.py b/setup.py
index 4033ea2935de5..f6f0cd9aabc0e 100755
--- a/setup.py
+++ b/setup.py
@@ -432,7 +432,7 @@ def run(self):
         extra_compile_args.append("/Z7")
         extra_link_args.append("/DEBUG")
 else:
-    extra_compile_args = []
+    extra_compile_args = ["-Werror"]
     extra_link_args = []
     if debugging_symbols_requested:
         extra_compile_args.append("-g")

From 236781065c7ea739a05fc108994c6e02244d13b7 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 18 Feb 2021 16:53:37 +0000
Subject: [PATCH 03/46] abs-imports

---
 pandas/core/arrays/string_.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 6b724cc147a7b..2e40eab2d528e 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -166,7 +166,7 @@ def construct_array_type(self) -> Type[StringArray]:
         -------
         type
         """
-        from .string_arrow import ArrowStringArray
+        from pandas.core.arrays.string_arrow import ArrowStringArray
 
         if self.storage == "python":
             return StringArray
@@ -184,7 +184,7 @@ def __from_arrow__(
         """
         import pyarrow
 
-        from .string_arrow import ArrowStringArray
+        from pandas.core.arrays.string_arrow import ArrowStringArray
 
         if isinstance(array, pyarrow.Array):
             chunks = [array]

From 9166d3b431ebda5781c3d4a8dc0cac2225696dd5 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 19 Feb 2021 13:45:26 +0000
Subject: [PATCH 04/46] post merge fixup

---
 pandas/core/arrays/string_.py                 |  12 +-
 pandas/core/arrays/string_arrow.py            |   2 +-
 pandas/tests/arrays/string_/test_string.py    |  59 +++----
 .../tests/arrays/string_/test_string_arrow.py |   3 +-
 pandas/tests/extension/test_string.py         |  19 +--
 pandas/tests/extension/test_string_arrow.py   | 150 ------------------
 6 files changed, 43 insertions(+), 202 deletions(-)
 delete mode 100644 pandas/tests/extension/test_string_arrow.py

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 2e40eab2d528e..7aaa3c32c84dc 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -52,6 +52,8 @@
 if TYPE_CHECKING:
     import pyarrow
 
+    from pandas.core.arrays.string_arrow import ArrowStringArray
+
 
 @register_extension_dtype
 class StringDtype(ExtensionDtype):
@@ -157,8 +159,12 @@ def __hash__(self) -> int:
         # custom __eq__ so have to override __hash__
         return super().__hash__()
 
-    # XXX: this is a classmethod, but we need to know the storage type.
-    def construct_array_type(self) -> Type[StringArray]:
+    # TODO: this is a classmethod, but we need to know the storage type.
+    # error: Signature of "construct_array_type" incompatible with supertype
+    # "ExtensionDtype"
+    def construct_array_type(  # type: ignore[override]
+        self,
+    ) -> Type[StringArray | ArrowStringArray]:
         """
         Return the array type associated with this dtype.
 
@@ -178,7 +184,7 @@ def __repr__(self):
 
     def __from_arrow__(
         self, array: Union[pyarrow.Array, pyarrow.ChunkedArray]
-    ) -> StringArray:
+    ) -> ArrowStringArray:
         """
         Construct StringArray from pyarrow Array/ChunkedArray.
         """
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 27fc52418e246..db2bfa8c5771e 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -136,7 +136,7 @@ def _from_sequence_of_strings(
     @property
     def dtype(self) -> StringDtype:
         """
-        An instance of 'ArrowStringDtype'.
+        An instance of 'StringDtype[pyarrow]'.
         """
         return self._dtype
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index d5254adc1ee24..27325c6b4b7ea 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -9,27 +9,14 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.arrays.string_arrow import (
-    ArrowStringArray,
-    ArrowStringDtype,
-)
+from pandas.core.arrays.string_arrow import ArrowStringArray
 
 skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0")
 
 
-@pytest.fixture(
-    params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)]
-)
+@pytest.fixture(params=["python", pytest.param("pyarrow", marks=skip_if_no_pyarrow)])
 def dtype(request):
-    return request.param
-
-
-@pytest.fixture
-def dtype_object(dtype):
-    if dtype == "string":
-        return pd.StringDtype
-    else:
-        return ArrowStringDtype
+    return pd.StringDtype(storage=request.param)
 
 
 @pytest.fixture(
@@ -43,7 +30,7 @@ def cls(request):
 
 
 def test_repr(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = (
             "AssertionError: assert '      A\n0     a\n1  None\n2     b' "
             "== '      A\n0     a\n1  <NA>\n2     b'"
@@ -55,10 +42,10 @@ def test_repr(dtype, request):
     expected = "      A\n0     a\n1  <NA>\n2     b"
     assert repr(df) == expected
 
-    expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
+    expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: StringDtype[python]"
     assert repr(df.A) == expected
 
-    expected = "<StringArray>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
+    expected = "<StringArray>\n['a', <NA>, 'b']\nLength: 3, dtype: StringDtype[python]"
     assert repr(df.A.array) == expected
 
 
@@ -104,7 +91,7 @@ def test_setitem_with_scalar_string(dtype):
     ],
 )
 def test_string_methods(input, method, dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -119,7 +106,7 @@ def test_string_methods(input, method, dtype, request):
 
 
 def test_astype_roundtrip(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = "ValueError: Could not convert object to NumPy datetime"
         mark = pytest.mark.xfail(reason=reason, raises=ValueError)
         request.node.add_marker(mark)
@@ -140,7 +127,7 @@ def test_astype_roundtrip(dtype, request):
 
 
 def test_add(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = (
             "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and "
             "'ArrowStringArray'"
@@ -168,7 +155,7 @@ def test_add(dtype, request):
 
 
 def test_add_2d(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = "Failed: DID NOT RAISE <class 'ValueError'>"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -184,7 +171,7 @@ def test_add_2d(dtype, request):
 
 
 def test_add_sequence(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = (
             "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' "
             "and 'list'"
@@ -205,7 +192,7 @@ def test_add_sequence(dtype, request):
 
 
 def test_mul(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = (
             "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
         )
@@ -288,7 +275,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ
 
 
 def test_comparison_methods_array(all_compare_operators, dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         if all_compare_operators in ["__eq__", "__ne__"]:
             reason = "NotImplementedError: Neither scalar nor ArrowStringArray"
         else:
@@ -359,7 +346,7 @@ def test_from_sequence_no_mutate(copy, cls, request):
 
 
 def test_astype_int(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -391,7 +378,7 @@ def test_reduce(skipna, dtype):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("skipna", [True, False])
 def test_min_max(method, skipna, dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -408,7 +395,7 @@ def test_min_max(method, skipna, dtype, request):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("box", [pd.Series, pd.array])
 def test_min_max_numpy(method, box, dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         if box is pd.array:
             reason = (
                 "TypeError: '<=' not supported between instances of 'str' and "
@@ -462,14 +449,14 @@ def test_arrow_array(dtype):
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         expected = pa.chunked_array(expected)
 
     assert arr.equals(expected)
 
 
 @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
-def test_arrow_roundtrip(dtype, dtype_object):
+def test_arrow_roundtrip(dtype):
     # roundtrip possible from arrow 1.0.0
     import pyarrow as pa
 
@@ -478,14 +465,14 @@ def test_arrow_roundtrip(dtype, dtype_object):
     table = pa.table(df)
     assert table.field("a").type == "string"
     result = table.to_pandas()
-    assert isinstance(result["a"].dtype, dtype_object)
+    assert isinstance(result["a"].dtype, type(dtype))
     tm.assert_frame_equal(result, df)
     # ensure the missing value is represented by NA and not np.nan or None
     assert result.loc[2, "a"] is pd.NA
 
 
 def test_value_counts_na(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = "TypeError: boolean value of NA is ambiguous"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -501,7 +488,7 @@ def test_value_counts_na(dtype, request):
 
 
 def test_value_counts_with_normalize(dtype, request):
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         reason = "TypeError: boolean value of NA is ambiguous"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -535,10 +522,10 @@ def test_use_inf_as_na(values, expected, dtype):
         tm.assert_frame_equal(result, expected)
 
 
-def test_memory_usage(dtype, request):
+def test_memory_usage(dtype):
     # GH 33963
 
-    if dtype == "arrow_string":
+    if dtype.storage == "pyarrow":
         pytest.skip("not applicable")
 
     series = pd.Series(["a", "b", "c"], dtype=dtype)
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index afe2394484fda..b5d0627c8583c 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -29,7 +29,8 @@ def test_config():
         arr = pd.array(["a", "b"])
         assert arr.dtype.storage == "pyarrow"
 
-    with pytest.raises(ValueError):
+    msg = re.escape("Value must be one of python|pyarrow")
+    with pytest.raises(ValueError, match=msg):
         pd.options.mode.string_storage = "foo"
 
 
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index d0a3ef17afdbc..a14e9af1b6abf 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -22,20 +22,17 @@
 
 import pandas as pd
 from pandas.core.arrays.string_ import StringDtype
-from pandas.core.arrays.string_arrow import ArrowStringDtype
 from pandas.tests.extension import base
 
 
 @pytest.fixture(
     params=[
-        StringDtype,
-        pytest.param(
-            ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="1.0.0")
-        ),
+        "python",
+        pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")),
     ]
 )
 def dtype(request):
-    return request.param()
+    return StringDtype(storage=request.param)
 
 
 @pytest.fixture
@@ -81,7 +78,7 @@ class TestDtype(base.BaseDtypeTests):
 
 class TestInterface(base.BaseInterfaceTests):
     def test_view(self, data, request):
-        if isinstance(data.dtype, ArrowStringDtype):
+        if data.dtype.storage == "pyarrow":
             mark = pytest.mark.xfail(reason="not implemented")
             request.node.add_marker(mark)
         super().test_view(data)
@@ -92,8 +89,8 @@ class TestConstructors(base.BaseConstructorsTests):
 
 
 class TestReshaping(base.BaseReshapingTests):
-    def test_transpose(self, data, dtype, request):
-        if isinstance(dtype, ArrowStringDtype):
+    def test_transpose(self, data, request):
+        if data.dtype.storage == "pyarrow":
             mark = pytest.mark.xfail(reason="not implemented")
             request.node.add_marker(mark)
         super().test_transpose(data)
@@ -104,8 +101,8 @@ class TestGetitem(base.BaseGetitemTests):
 
 
 class TestSetitem(base.BaseSetitemTests):
-    def test_setitem_preserves_views(self, data, dtype, request):
-        if isinstance(dtype, ArrowStringDtype):
+    def test_setitem_preserves_views(self, data, request):
+        if data.dtype.storage == "pyarrow":
             mark = pytest.mark.xfail(reason="not implemented")
             request.node.add_marker(mark)
         super().test_setitem_preserves_views(data)
diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py
deleted file mode 100644
index 848e8a435b530..0000000000000
--- a/pandas/tests/extension/test_string_arrow.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import string
-
-import numpy as np
-import pytest
-
-import pandas as pd
-from pandas.core.arrays.string_arrow import ArrowStringArray
-from pandas.tests.extension import base
-
-
-@pytest.fixture
-def dtype():
-    return pd.StringDtype(storage="pyarrow")
-
-
-@pytest.fixture
-def data():
-    strings = np.random.choice(list(string.ascii_letters), size=100)
-    while strings[0] == strings[1]:
-        strings = np.random.choice(list(string.ascii_letters), size=100)
-
-    return ArrowStringArray._from_sequence(strings)
-
-
-@pytest.fixture
-def data_missing():
-    """Length 2 array with [NA, Valid]"""
-    return ArrowStringArray._from_sequence([pd.NA, "A"])
-
-
-@pytest.fixture
-def data_for_sorting():
-    return ArrowStringArray._from_sequence(["B", "C", "A"])
-
-
-@pytest.fixture
-def data_missing_for_sorting():
-    return ArrowStringArray._from_sequence(["B", pd.NA, "A"])
-
-
-@pytest.fixture
-def na_value():
-    return pd.NA
-
-
-@pytest.fixture
-def data_for_grouping():
-    return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"])
-
-
-class TestDtype(base.BaseDtypeTests):
-    pass
-
-
-class TestInterface(base.BaseInterfaceTests):
-    @pytest.mark.xfail(reason="Fails until implement, remove before merge")
-    def test_view(self, data):
-        base.BaseInterfaceTests.test_view(self, data)
-
-
-class TestConstructors(base.BaseConstructorsTests):
-    pass
-
-
-class TestReshaping(base.BaseReshapingTests):
-    pass
-
-
-class TestGetitem(base.BaseGetitemTests):
-    @pytest.mark.xfail(
-        reason="pyarrow.lib.ArrowNotImplementedError: Function "
-        "fill_null has no kernel matching input types "
-        "(array[string], scalar[string])"
-    )
-    def test_take_non_na_fill_value(self, data_missing):
-        super().test_take_non_na_fill_value(data_missing)
-
-    @pytest.mark.xfail(
-        reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no "
-        "kernel matching input types (array[string], scalar[string])"
-    )
-    def test_reindex_non_na_fill_value(self, data_missing):
-        super().test_reindex_non_na_fill_value(self, data_missing)
-
-
-class TestSetitem(base.BaseSetitemTests):
-    @pytest.mark.xfail(reason="TODO")
-    def test_setitem_preserves_views(self, data):
-        # Unclear where the issue is (pyarrow getitem, our getitem, our slice)
-        # and what to do here.
-        super().test_setitem_preserves_views(data)
-
-
-class TestMissing(base.BaseMissingTests):
-    pass
-
-
-class TestNoReduce(base.BaseNoReduceTests):
-    @pytest.mark.parametrize("skipna", [True, False])
-    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
-        op_name = all_numeric_reductions
-
-        if op_name in ["min", "max"]:
-            return None
-
-        s = pd.Series(data)
-        with pytest.raises(TypeError):
-            getattr(s, op_name)(skipna=skipna)
-
-
-class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.skip(reason="returns nullable")
-    def test_value_counts(self, all_data, dropna):
-        return super().test_value_counts(all_data, dropna)
-
-
-class TestCasting(base.BaseCastingTests):
-    pass
-
-
-class TestComparisonOps(base.BaseComparisonOpsTests):
-    def _compare_other(self, s, data, op_name, other):
-        if op_name not in {"__eq__", "__ne__"}:
-            pytest.skip(f"{op_name} is not implemented.")
-        result = getattr(s, op_name)(other)
-        expected = getattr(s.astype(object), op_name)(other).astype("boolean")
-        self.assert_series_equal(result, expected)
-
-    def test_compare_scalar(self, data, all_compare_operators):
-        op_name = all_compare_operators
-        s = pd.Series(data)
-        self._compare_other(s, data, op_name, "abc")
-
-    def test_compare_array(self, data, all_compare_operators):
-        op_name = all_compare_operators
-        s = pd.Series(data)
-        other = pd.Series([data[0]] * len(data), dtype=data.dtype)
-        self._compare_other(s, data, op_name, other)
-
-
-class TestParsing(base.BaseParsingTests):
-    pass
-
-
-class TestPrinting(base.BasePrintingTests):
-    pass
-
-
-class TestGroupBy(base.BaseGroupbyTests):
-    pass

From 8760705eef02a1122a44927b0d12b0c83c141010 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 19 Feb 2021 17:07:02 +0000
Subject: [PATCH 05/46] StringDtype[python] -> string[python]

---
 pandas/core/arrays/string_.py              | 4 +---
 pandas/core/construction.py                | 2 +-
 pandas/tests/arrays/string_/test_string.py | 4 ++--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 7aaa3c32c84dc..fa7d1ec5bf417 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -99,7 +99,7 @@ def __init__(self, storage=None):
 
     @property
     def name(self):
-        return f"StringDtype[{self.storage}]"
+        return f"string[{self.storage}]"
 
     @property
     def type(self) -> Type[str]:
@@ -121,9 +121,7 @@ def construct_from_string(cls, string):
             ========================== ==============
             ``'string'``               global default
             ``'string[python]'``       python
-            ``'StringDtype[python]'``  python
             ``'string[pyarrow]'``      pyarrow
-            ``'StringDtype[pyarrow]'`` pyarrow
             ========================== =============
 
         Returns
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index dd75473da6d78..189a6ccc0f884 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -239,7 +239,7 @@ def array(
     >>> pd.array(["a", None, "c"])
     <StringArray>
     ['a', <NA>, 'c']
-    Length: 3, dtype: string
+    Length: 3, dtype: string[python]
 
     >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
     <PeriodArray>
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 27325c6b4b7ea..1a3064c02810a 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -42,10 +42,10 @@ def test_repr(dtype, request):
     expected = "      A\n0     a\n1  <NA>\n2     b"
     assert repr(df) == expected
 
-    expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: StringDtype[python]"
+    expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string[python]"
     assert repr(df.A) == expected
 
-    expected = "<StringArray>\n['a', <NA>, 'b']\nLength: 3, dtype: StringDtype[python]"
+    expected = "<StringArray>\n['a', <NA>, 'b']\nLength: 3, dtype: string[python]"
     assert repr(df.A.array) == expected
 
 

From 2c657df7ff233256c271b7ff79325bacbd5a2d57 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 22 Mar 2021 16:06:12 +0000
Subject: [PATCH 06/46] pre-commit fix for inconsistent use of pandas namespace

---
 pandas/tests/arrays/string_/test_string_arrow.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index b5d0627c8583c..7565e1aa0488b 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -35,14 +35,14 @@ def test_config():
 
 
 @pytest.mark.parametrize("chunked", [True, False])
-@pytest.mark.parametrize("array", [np, pa])
-def test_constructor_not_string_type_raises(array, chunked):
-    arr = array.array([1, 2, 3])
+@pytest.mark.parametrize("np_or_pa", [np, pa])
+def test_constructor_not_string_type_raises(np_or_pa, chunked):
+    arr = np_or_pa.array([1, 2, 3])
     if chunked:
-        if array is np:
+        if np_or_pa is np:
             pytest.skip("chunked not applicable to numpy array")
         arr = pa.chunked_array(arr)
-    if array is np:
+    if np_or_pa is np:
         msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray"
     else:
         msg = re.escape(

From 647a6c2e0699475ab282aad993d19d4ca5c5da2f Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 22 Mar 2021 16:12:06 +0000
Subject: [PATCH 07/46] fix typo

---
 pandas/core/arrays/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 6607d1aeac3b7..ae06004dfb485 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1059,8 +1059,8 @@ def take(
               :func:`numpy.take`.
 
             * True: ``-1`` in `indices` indicate missing values.
-              These values are set to `fill_value`. Any other other negative
-              value raise a ``ValueError``.
+              These values are set to `fill_value`. Any other negative
+              value raises a ``ValueError``.
 
         fill_value : any, optional
             Fill value to use for NA-indices when `allow_fill` is True.

From 0596fd7ec634b9af983a11b2b3ea28a1724cbc57 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 22 Mar 2021 16:23:57 +0000
Subject: [PATCH 08/46] pre-commit fixup - undefined name 'ArrowStringDtype'

---
 pandas/tests/extension/test_string.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index e74938b544240..269ede7ca93ae 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -26,7 +26,7 @@
 
 
 def split_array(arr):
-    if not isinstance(arr.dtype, ArrowStringDtype):
+    if arr.dtype.storage != "pyarrow":
         pytest.skip("chunked array n/a")
 
     def _split_array(arr):

From 69a6cc1becd3f9198ad1b2967e1a5df0617051f9 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 28 Mar 2021 11:42:49 +0100
Subject: [PATCH 09/46] "StringDtype[storage]" -> "string[storage]" misc

---
 pandas/core/arrays/string_.py      | 4 ++--
 pandas/core/arrays/string_arrow.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index bcb8be5657a73..eadb3ef9e982c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -141,9 +141,9 @@ def construct_from_string(cls, string):
         if string == "string":
             # TODO: use global default
             return cls()
-        elif string in {"string[python]", "StringDtype[python]"}:
+        elif string == "string[python]":
             return cls(storage="python")
-        elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}:
+        elif string == "string[pyarrow]":
             return cls(storage="pyarrow")
         else:
             raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 6d088d957b7d0..4bed6a067cea3 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -139,7 +139,7 @@ def _from_sequence_of_strings(
     @property
     def dtype(self) -> StringDtype:
         """
-        An instance of 'StringDtype[pyarrow]'.
+        An instance of 'string[pyarrow]'.
         """
         return self._dtype
 

From bd147ba28c92656bdaeb3f3c5f106101b0fec154 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 28 Mar 2021 12:31:06 +0100
Subject: [PATCH 10/46] __from_arrow__

---
 pandas/core/arrays/string_.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index eadb3ef9e982c..bdbb2827cd82d 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -182,27 +182,31 @@ def __repr__(self):
 
     def __from_arrow__(
         self, array: Union[pyarrow.Array, pyarrow.ChunkedArray]
-    ) -> ArrowStringArray:
+    ) -> StringArray | ArrowStringArray:
         """
         Construct StringArray from pyarrow Array/ChunkedArray.
         """
-        import pyarrow
+        if self.storage == "pyarrow":
+            from pandas.core.arrays.string_arrow import ArrowStringArray
 
-        from pandas.core.arrays.string_arrow import ArrowStringArray
-
-        if isinstance(array, pyarrow.Array):
-            chunks = [array]
+            return ArrowStringArray(array)
         else:
-            # pyarrow.ChunkedArray
-            chunks = array.chunks
 
-        results = []
-        for arr in chunks:
-            # using _from_sequence to ensure None is converted to NA
-            str_arr = StringArray._from_sequence(np.array(arr))
-            results.append(str_arr)
+            import pyarrow
+
+            if isinstance(array, pyarrow.Array):
+                chunks = [array]
+            else:
+                # pyarrow.ChunkedArray
+                chunks = array.chunks
+
+            results = []
+            for arr in chunks:
+                # using _from_sequence to ensure None is converted to NA
+                str_arr = StringArray._from_sequence(np.array(arr))
+                results.append(str_arr)
 
-        return ArrowStringArray._concat_same_type(results)
+            return StringArray._concat_same_type(results)
 
 
 class StringArray(PandasArray):

From 830275f1b8b6aefd2c8d0d13610ed58d0975249e Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 28 Mar 2021 14:52:13 +0100
Subject: [PATCH 11/46] more testing (wip)

---
 pandas/conftest.py                            | 20 +++++
 pandas/core/arrays/string_arrow.py            |  2 +-
 pandas/tests/arrays/string_/test_string.py    | 89 ++++++++++++++-----
 pandas/tests/dtypes/test_common.py            |  5 +-
 pandas/tests/dtypes/test_inference.py         |  4 +-
 pandas/tests/extension/base/casting.py        |  6 +-
 pandas/tests/frame/methods/test_astype.py     |  7 ++
 .../tests/frame/methods/test_combine_first.py | 10 ++-
 pandas/tests/frame/test_constructors.py       |  6 +-
 pandas/tests/tools/test_to_numeric.py         |  4 +-
 10 files changed, 113 insertions(+), 40 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index f3356d2998ff8..403b41a1c86b7 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1131,6 +1131,26 @@ def string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        "string",
+        "string[python]",
+        pytest.param(
+            "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+        ),
+    ]
+)
+def nullable_string_dtype(request):
+    """
+    Parametrized fixture for nullable string dtype aliases.
+
+    * 'string'
+    * 'string[python]'
+    * 'string[pyarrow]'
+    """
+    return request.param
+
+
 @pytest.fixture(params=tm.BYTES_DTYPES)
 def bytes_dtype(request):
     """
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4bed6a067cea3..b68ae3dbe4218 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -96,7 +96,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray):
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[arrow]")
     <ArrowStringArray>
     ['This is', 'some text', <NA>, 'data.']
-    Length: 4, dtype: arrow_string
+    Length: 4, dtype: string[pyarrow]
     """
 
     _dtype = StringDtype(storage="pyarrow")
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index b34c8d9722515..e224233b3bd13 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -14,9 +14,30 @@
 skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0")
 
 
-@pytest.fixture(params=["python", pytest.param("pyarrow", marks=skip_if_no_pyarrow)])
+def _is_pyarrow_dtype(dtype):
+    """
+    Return True for the "string[pyarrow]" alias or a dtype instance whose
+    ``storage`` attribute equals "pyarrow"; False otherwise.
+    """
+    if isinstance(dtype, str):
+        return dtype == "string[pyarrow]"
+    return dtype.storage == "pyarrow"
+
+
+@pytest.fixture(
+    params=[
+        "string",
+        "string[python]",
+        pytest.param("string[pyarrow]", marks=skip_if_no_pyarrow),
+        pd.StringDtype(storage="python"),
+        pytest.param(
+            pd.StringDtype(storage="pyarrow"),
+            marks=skip_if_no_pyarrow,
+        ),
+    ]
+)
 def dtype(request):
-    return pd.StringDtype(storage=request.param)
+    return request.param
 
 
 @pytest.fixture(
@@ -30,7 +51,7 @@ def cls(request):
 
 
 def test_repr(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = (
             "AssertionError: assert '      A\n0     a\n1  None\n2     b' "
             "== '      A\n0     a\n1  <NA>\n2     b'"
@@ -91,7 +112,7 @@ def test_setitem_with_scalar_string(dtype):
     ],
 )
 def test_string_methods(input, method, dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -106,7 +127,7 @@ def test_string_methods(input, method, dtype, request):
 
 
 def test_astype_roundtrip(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = "ValueError: Could not convert object to NumPy datetime"
         mark = pytest.mark.xfail(reason=reason, raises=ValueError)
         request.node.add_marker(mark)
@@ -127,7 +148,7 @@ def test_astype_roundtrip(dtype, request):
 
 
 def test_add(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = (
             "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and "
             "'ArrowStringArray'"
@@ -155,7 +176,7 @@ def test_add(dtype, request):
 
 
 def test_add_2d(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = "Failed: DID NOT RAISE <class 'ValueError'>"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -171,7 +192,7 @@ def test_add_2d(dtype, request):
 
 
 def test_add_sequence(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = (
             "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' "
             "and 'list'"
@@ -192,7 +213,7 @@ def test_add_sequence(dtype, request):
 
 
 def test_mul(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = (
             "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
         )
@@ -275,7 +296,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ
 
 
 def test_comparison_methods_array(all_compare_operators, dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         if all_compare_operators in ["__eq__", "__ne__"]:
             reason = "NotImplementedError: Neither scalar nor ArrowStringArray"
         else:
@@ -346,7 +367,7 @@ def test_from_sequence_no_mutate(copy, cls, request):
 
 
 def test_astype_int(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -358,9 +379,20 @@ def test_astype_int(dtype, request):
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_astype_float(any_float_allowed_nullable_dtype):
+def test_astype_float(dtype, any_float_allowed_nullable_dtype, request):
     # Don't compare arrays (37974)
-    ser = pd.Series(["1.1", pd.NA, "3.3"], dtype="string")
+
+    if _is_pyarrow_dtype(dtype):
+        if any_float_allowed_nullable_dtype in {"Float32", "Float64"}:
+            reason = "TypeError: Cannot interpret 'Float32Dtype()' as a data type"
+        else:
+            reason = (
+                "TypeError: float() argument must be a string or a number, not 'NAType'"
+            )
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
+    ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype)
 
     result = ser.astype(any_float_allowed_nullable_dtype)
     expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_allowed_nullable_dtype)
@@ -378,7 +410,7 @@ def test_reduce(skipna, dtype):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("skipna", [True, False])
 def test_min_max(method, skipna, dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -395,7 +427,7 @@ def test_min_max(method, skipna, dtype, request):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("box", [pd.Series, pd.array])
 def test_min_max_numpy(method, box, dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         if box is pd.array:
             reason = (
                 "TypeError: '<=' not supported between instances of 'str' and "
@@ -423,17 +455,25 @@ def test_reduce_missing(skipna, dtype):
         assert pd.isna(result)
 
 
-def test_fillna_args():
+def test_fillna_args(dtype, request):
     # GH 37987
 
-    arr = pd.array(["a", pd.NA], dtype="string")
+    if _is_pyarrow_dtype(dtype):
+        reason = (
+            "AssertionError: Regex pattern \"Cannot set non-string value '1' into "
+            "a StringArray.\" does not match 'Scalar must be NA or str'"
+        )
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
+    arr = pd.array(["a", pd.NA], dtype=dtype)
 
     res = arr.fillna(value="b")
-    expected = pd.array(["a", "b"], dtype="string")
+    expected = pd.array(["a", "b"], dtype=dtype)
     tm.assert_extension_array_equal(res, expected)
 
     res = arr.fillna(value=np.str_("b"))
-    expected = pd.array(["a", "b"], dtype="string")
+    expected = pd.array(["a", "b"], dtype=dtype)
     tm.assert_extension_array_equal(res, expected)
 
     msg = "Cannot set non-string value '1' into a StringArray."
@@ -449,7 +489,7 @@ def test_arrow_array(dtype):
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         expected = pa.chunked_array(expected)
 
     assert arr.equals(expected)
@@ -465,14 +505,15 @@ def test_arrow_roundtrip(dtype):
     table = pa.table(df)
     assert table.field("a").type == "string"
     result = table.to_pandas()
-    assert isinstance(result["a"].dtype, type(dtype))
+    if not isinstance(dtype, str):
+        assert isinstance(result["a"].dtype, type(dtype))
     tm.assert_frame_equal(result, df)
     # ensure the missing value is represented by NA and not np.nan or None
     assert result.loc[2, "a"] is pd.NA
 
 
 def test_value_counts_na(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = "TypeError: boolean value of NA is ambiguous"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -488,7 +529,7 @@ def test_value_counts_na(dtype, request):
 
 
 def test_value_counts_with_normalize(dtype, request):
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         reason = "TypeError: boolean value of NA is ambiguous"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -525,7 +566,7 @@ def test_use_inf_as_na(values, expected, dtype):
 def test_memory_usage(dtype):
     # GH 33963
 
-    if dtype.storage == "pyarrow":
+    if _is_pyarrow_dtype(dtype):
         pytest.skip("not applicable")
 
     series = pd.Series(["a", "b", "c"], dtype=dtype)
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 406aec9d4c16e..616f46624bfd7 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -281,7 +281,10 @@ def test_is_string_dtype():
     assert com.is_string_dtype(object)
     assert com.is_string_dtype(np.array(["a", "b"]))
     assert com.is_string_dtype(pd.StringDtype())
-    assert com.is_string_dtype(pd.array(["a", "b"], dtype="string"))
+
+
+def test_is_string_dtype_nullable(nullable_string_dtype):
+    assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))
 
 
 integer_dtypes: List = []
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index b3c6015475674..907991b97ead1 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1267,9 +1267,9 @@ def test_interval(self):
     @pytest.mark.parametrize("klass", [pd.array, Series])
     @pytest.mark.parametrize("skipna", [True, False])
     @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]])
-    def test_string_dtype(self, data, skipna, klass):
+    def test_string_dtype(self, data, skipna, klass, nullable_string_dtype):
         # StringArray
-        val = klass(data, dtype="string")
+        val = klass(data, dtype=nullable_string_dtype)
         inferred = lib.infer_dtype(val, skipna=skipna)
         assert inferred == "string"
 
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index 7c5ef5b3b27d3..47f4f7585243d 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -43,10 +43,10 @@ def test_astype_str(self, data):
         expected = pd.Series([str(x) for x in data[:5]], dtype=str)
         self.assert_series_equal(result, expected)
 
-    def test_astype_string(self, data):
+    def test_astype_string(self, data, nullable_string_dtype):
         # GH-33465
-        result = pd.Series(data[:5]).astype("string")
-        expected = pd.Series([str(x) for x in data[:5]], dtype="string")
+        result = pd.Series(data[:5]).astype(nullable_string_dtype)
+        expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype)
         self.assert_series_equal(result, expected)
 
     def test_to_numpy(self, data):
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 161fe7990a327..c0b6e18e23847 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -567,6 +569,11 @@ def test_astype_empty_dtype_dict(self):
         "df",
         [
             DataFrame(Series(["x", "y", "z"], dtype="string")),
+            DataFrame(Series(["x", "y", "z"], dtype="string[python]")),
+            pytest.param(
+                DataFrame(Series(["x", "y", "z"], dtype="string[pyarrow]")),
+                marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
+            ),
             DataFrame(Series(["x", "y", "z"], dtype="category")),
             DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])),
             DataFrame(Series(3 * [Interval(0, 1)])),
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index b4d8a53e4b23f..dd91b32c8eb8c 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -381,15 +381,17 @@ def test_combine_first_with_asymmetric_other(self, val):
 
         tm.assert_frame_equal(res, exp)
 
-    def test_combine_first_string_dtype_only_na(self):
+    def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
         # GH: 37519
-        df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string")
-        df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string")
+        df = DataFrame(
+            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
+        )
+        df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype)
         df.set_index(["a", "b"], inplace=True)
         df2.set_index(["a", "b"], inplace=True)
         result = df.combine_first(df2)
         expected = DataFrame(
-            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string"
+            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
         ).set_index(["a", "b"])
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index b76a44b3c86be..a62f2b0426911 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -1649,10 +1649,10 @@ def test_constructor_empty_with_string_dtype(self):
         df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
         tm.assert_frame_equal(df, expected)
 
-    def test_constructor_empty_with_string_extension(self):
+    def test_constructor_empty_with_string_extension(self, nullable_string_dtype):
         # GH 34915
-        expected = DataFrame(index=[], columns=["c1"], dtype="string")
-        df = DataFrame(columns=["c1"], dtype="string")
+        expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype)
+        df = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
         tm.assert_frame_equal(df, expected)
 
     def test_constructor_single_value(self):
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
index 65aa189a3e965..30d6436c7e250 100644
--- a/pandas/tests/tools/test_to_numeric.py
+++ b/pandas/tests/tools/test_to_numeric.py
@@ -725,9 +725,9 @@ def test_precision_float_conversion(strrep):
         (["1", "2", "3.5"], Series([1, 2, 3.5])),
     ],
 )
-def test_to_numeric_from_nullable_string(values, expected):
+def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
     # https://github.com/pandas-dev/pandas/issues/37262
-    s = Series(values, dtype="string")
+    s = Series(values, dtype=nullable_string_dtype)
     result = to_numeric(s)
     tm.assert_series_equal(result, expected)
 

From 214e524c3ed801450d926222db30afe5543675ef Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 28 Mar 2021 15:22:53 +0100
Subject: [PATCH 12/46] fix inference

---
 pandas/_libs/lib.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 94a4d586b4f13..c0979d165ba3c 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1110,6 +1110,8 @@ _TYPE_MAP = {
     "complex128": "complex",
     "c": "complex",
     "string": "string",
+    "string[python]": "string",
+    "string[pyarrow]": "string",
     "S": "bytes",
     "U": "string",
     "bool": "boolean",

From 5cfa97ac464f24c843d0aaf4d4fca6d950ea5776 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 1 Apr 2021 16:48:36 +0100
Subject: [PATCH 13/46] post-merge fixup

---
 pandas/tests/extension/json/array.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index a63c849d25a9f..6c1161294dd17 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -39,7 +39,6 @@
     ExtensionDtype,
 )
 from pandas.api.types import is_bool_dtype
-from pandas.core.arrays.string_arrow import ArrowStringDtype
 
 
 class JSONDtype(ExtensionDtype):
@@ -195,7 +194,7 @@ def astype(self, dtype, copy=True):
             if copy:
                 return self.copy()
             return self
-        elif isinstance(dtype, (StringDtype, ArrowStringDtype)):
+        elif isinstance(dtype, StringDtype):
             value = self.astype(str)  # numpy doesn'y like nested dicts
             return dtype.construct_array_type()._from_sequence(value, copy=False)
 

From 74dbf96c9d2d077176a5f35620c654ee9bd19903 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 1 Apr 2021 17:48:06 +0100
Subject: [PATCH 14/46] remove changes to test_string_dtype - broken off in
 #40725

---
 pandas/_libs/lib.pyx                  | 2 --
 pandas/tests/dtypes/test_inference.py | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index b922489fedddc..646b5a05afcad 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1111,8 +1111,6 @@ _TYPE_MAP = {
     "complex128": "complex",
     "c": "complex",
     "string": "string",
-    "string[python]": "string",
-    "string[pyarrow]": "string",
     "S": "bytes",
     "U": "string",
     "bool": "boolean",
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 907991b97ead1..b3c6015475674 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1267,9 +1267,9 @@ def test_interval(self):
     @pytest.mark.parametrize("klass", [pd.array, Series])
     @pytest.mark.parametrize("skipna", [True, False])
     @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]])
-    def test_string_dtype(self, data, skipna, klass, nullable_string_dtype):
+    def test_string_dtype(self, data, skipna, klass):
         # StringArray
-        val = klass(data, dtype=nullable_string_dtype)
+        val = klass(data, dtype="string")
         inferred = lib.infer_dtype(val, skipna=skipna)
         assert inferred == "string"
 

From 3bda421aa3a26159ea799d25183d4d557226fdc4 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 15 Apr 2021 10:57:28 +0100
Subject: [PATCH 15/46] post merge fix-up

---
 pandas/conftest.py                 | 2 --
 pandas/core/arrays/string_arrow.py | 2 +-
 pandas/core/strings/accessor.py    | 3 +--
 pandas/tests/io/test_parquet.py    | 4 +---
 4 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 5ab52f4b1e7f5..25e001859c96a 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1146,8 +1146,6 @@ def nullable_string_dtype(request):
     * 'string[python]'
     * 'string[pyarrow]'
     """
-    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
     return request.param
 
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 26b7be9174d33..21ff40cb021aa 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -134,7 +134,7 @@ def _chk_pyarrow_available(cls) -> None:
     @classmethod
     def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
         cls._chk_pyarrow_available()
-        # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value
+        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
         scalars = lib.ensure_string_array(scalars, copy=False)
         return cls(pa.array(scalars, type=pa.string(), from_pandas=True))
 
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 0b5613e302175..9a100aa4231b6 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -154,11 +154,10 @@ class StringMethods(NoNewAttributesMixin):
 
     def __init__(self, data):
         from pandas.core.arrays.string_ import StringDtype
-        from pandas.core.arrays.string_arrow import ArrowStringDtype
 
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data.dtype)
-        self._is_string = isinstance(data.dtype, (StringDtype, ArrowStringDtype))
+        self._is_string = isinstance(data.dtype, StringDtype)
         self._data = data
 
         self._index = self._name = None
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 21ea2bd560060..631cc8bfc8ff7 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -839,9 +839,7 @@ def test_additional_extension_arrays(self, pa):
     @td.skip_if_no("pyarrow", min_version="1.0.0")
     def test_pyarrow_backed_string_array(self, pa):
         # test ArrowStringArray supported through the __arrow_array__ protocol
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
-        df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="arrow_string")})
+        df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
         check_round_trip(df, pa, expected=df)
 
     @td.skip_if_no("pyarrow", min_version="0.16.0")

From 523e24c0bcc70c202fd9cec139fcb0df0d62caed Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 15 Apr 2021 11:16:45 +0100
Subject: [PATCH 16/46] post merge fix-up

---
 pandas/tests/frame/methods/test_astype.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 544960113fafc..322252d70a45e 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -569,9 +569,10 @@ def test_astype_empty_dtype_dict(self):
         "data, dtype",
         [
             (["x", "y", "z"], "string"),
+            (["x", "y", "z"], "string[python]"),
             pytest.param(
                 ["x", "y", "z"],
-                "arrow_string",
+                "string[pyarrow]",
                 marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
             ),
             (["x", "y", "z"], "category"),
@@ -582,8 +583,6 @@ def test_astype_empty_dtype_dict(self):
     @pytest.mark.parametrize("errors", ["raise", "ignore"])
     def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
         # https://github.com/pandas-dev/pandas/issues/35471
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
         df = DataFrame(Series(data, dtype=dtype))
         if errors == "ignore":
             expected = df

From 279624cd1d489bf86653d3b588ad6bb66219050d Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 15 Apr 2021 15:19:40 +0100
Subject: [PATCH 17/46] revert some changes made for pre-commit checks.

---
 pandas/tests/arrays/string_/test_string_arrow.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 7565e1aa0488b..b5d0627c8583c 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -35,14 +35,14 @@ def test_config():
 
 
 @pytest.mark.parametrize("chunked", [True, False])
-@pytest.mark.parametrize("np_or_pa", [np, pa])
-def test_constructor_not_string_type_raises(np_or_pa, chunked):
-    arr = np_or_pa.array([1, 2, 3])
+@pytest.mark.parametrize("array", [np, pa])
+def test_constructor_not_string_type_raises(array, chunked):
+    arr = array.array([1, 2, 3])
     if chunked:
-        if np_or_pa is np:
+        if array is np:
             pytest.skip("chunked not applicable to numpy array")
         arr = pa.chunked_array(arr)
-    if np_or_pa is np:
+    if array is np:
         msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray"
     else:
         msg = re.escape(

From c5ced5a1736cf34fb2814278ae65ad93c705c973 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 16 Apr 2021 13:05:23 +0100
Subject: [PATCH 18/46] post merge fix-up

---
 pandas/tests/arrays/string_/test_string_arrow.py | 2 +-
 pandas/tests/series/methods/test_astype.py       | 6 ++----
 pandas/tests/series/methods/test_update.py       | 4 ++--
 pandas/tests/strings/test_string_array.py        | 6 +++++-
 pandas/tests/strings/test_strings.py             | 2 +-
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index b5d0627c8583c..be89db9f25d20 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -4,7 +4,7 @@
 import pytest
 
 import pandas as pd
-import pandas.testing as tm
+import pandas._testing as tm
 
 pa = pytest.importorskip("pyarrow", minversion="1.0.0")
 
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index bebe6948cff9c..157b76c630ef9 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -250,9 +250,10 @@ def test_td64_series_astype_object(self):
         "data, dtype",
         [
             (["x", "y", "z"], "string"),
+            (["x", "y", "z"], "string[python]"),
             pytest.param(
                 ["x", "y", "z"],
-                "arrow_string",
+                "string[pyarrow]",
                 marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
             ),
             (["x", "y", "z"], "category"),
@@ -263,9 +264,6 @@ def test_td64_series_astype_object(self):
     @pytest.mark.parametrize("errors", ["raise", "ignore"])
     def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
         # https://github.com/pandas-dev/pandas/issues/35471
-
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
         ser = Series(data, dtype=dtype)
         if errors == "ignore":
             expected = ser
diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py
index 9a64877cb92ff..98cfb4cd6414d 100644
--- a/pandas/tests/series/methods/test_update.py
+++ b/pandas/tests/series/methods/test_update.py
@@ -11,7 +11,6 @@
     Timestamp,
 )
 import pandas._testing as tm
-from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
 
 
 class TestUpdate:
@@ -88,11 +87,12 @@ def test_update_from_non_series(self, series, other, expected):
         "data, other, expected, dtype",
         [
             (["a", None], [None, "b"], ["a", "b"], "string"),
+            (["a", None], [None, "b"], ["a", "b"], "string[python]"),
             pytest.param(
                 ["a", None],
                 [None, "b"],
                 ["a", "b"],
-                "arrow_string",
+                "string[pyarrow]",
                 marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
             ),
             ([1, None], [None, 2], [1, 2], "Int64"),
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index 02ccb3a930557..4bad3eb4fba47 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -134,5 +134,9 @@ def test_capitalize(input, method, nullable_string_dtype):
     result = method(a.str)
     expected = method(b.str)
 
-    assert result.dtype.name == nullable_string_dtype
+    if nullable_string_dtype == "string":
+        assert result.dtype.name == "string[python]"
+    else:
+        assert result.dtype.name == nullable_string_dtype
+
     tm.assert_series_equal(result.astype(object), expected)
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index a809446f0bc06..8b231c35282cd 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -139,7 +139,7 @@ def test_repeat():
 def test_repeat_with_null(nullable_string_dtype, request):
     # GH: 31632
 
-    if nullable_string_dtype == "arrow_string":
+    if nullable_string_dtype == "string[pyarrow]":
         reason = 'Attribute "dtype" are different'
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)

From 459812c335e5c34a461114c95a1e222eb260ad12 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 16 Apr 2021 13:26:09 +0100
Subject: [PATCH 19/46] undo unrelated changes

---
 pandas/core/arrays/base.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index c9dafb3714f7d..354e4cd765509 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -420,8 +420,6 @@ def __ne__(self, other: Any) -> ArrayLike:  # type: ignore[override]
         """
         Return for `self != other` (element-wise in-equality).
         """
-        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)):
-            return NotImplemented
         return ~(self == other)
 
     def to_numpy(
@@ -1052,9 +1050,9 @@ def take(
               from the right (the default). This is similar to
               :func:`numpy.take`.
 
-            * True: ``-1`` in `indices` indicate missing values.
-              These values are set to `fill_value`. Any other negative
-              value raises a ``ValueError``.
+            * True: negative values in `indices` indicate
+              missing values. These values are set to `fill_value`. Any other
+              other negative values raise a ``ValueError``.
 
         fill_value : any, optional
             Fill value to use for NA-indices when `allow_fill` is True.

From d707b6b96d7dee25fc14bc29945d48bdebd2b364 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 16 Apr 2021 13:54:28 +0100
Subject: [PATCH 20/46] undo changes to imports

---
 pandas/core/arrays/string_arrow.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 09004ab562422..73d0946217b52 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -9,8 +9,6 @@
 )
 
 import numpy as np
-import pyarrow as pa
-import pyarrow.compute as pc
 
 from pandas._libs import lib
 from pandas._typing import (
@@ -42,14 +40,25 @@
 )
 from pandas.core.strings.object_array import ObjectStringArrayMixin
 
-ARROW_CMP_FUNCS = {
-    "eq": pc.equal,
-    "ne": pc.not_equal,
-    "lt": pc.less,
-    "gt": pc.greater,
-    "le": pc.less_equal,
-    "ge": pc.greater_equal,
-}
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+else:
+    # PyArrow backed StringArrays are available starting at 1.0.0, but this
+    # file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute
+    # and its compute functions existed. GH38801
+    if LooseVersion(pa.__version__) >= "1.0.0":
+        import pyarrow.compute as pc
+
+        ARROW_CMP_FUNCS = {
+            "eq": pc.equal,
+            "ne": pc.not_equal,
+            "lt": pc.less,
+            "gt": pc.greater,
+            "le": pc.less_equal,
+            "ge": pc.greater_equal,
+        }
 
 
 if TYPE_CHECKING:

From daaac062c08f31f879ed5f0166da2866c79e5e2b Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sat, 17 Apr 2021 12:26:01 +0100
Subject: [PATCH 21/46] StringDtype.construct_array_type - add ref to issue

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index a390487f8b191..dd06976f746c0 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -155,7 +155,7 @@ def __hash__(self) -> int:
         # custom __eq__ so have to override __hash__
         return super().__hash__()
 
-    # TODO: this is a classmethod, but we need to know the storage type.
+    # https://github.com/pandas-dev/pandas/issues/36126
     # error: Signature of "construct_array_type" incompatible with supertype
     # "ExtensionDtype"
     def construct_array_type(  # type: ignore[override]

From 42d382faa38cea74177d15ab7b86df6368d91a21 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sat, 1 May 2021 13:18:27 +0100
Subject: [PATCH 22/46] post merge fixup

---
 asv_bench/benchmarks/algorithms.py         | 19 ++++----
 asv_bench/benchmarks/strings.py            |  8 +---
 pandas/core/arrays/interval.py             |  3 +-
 pandas/core/arrays/string_arrow.py         |  2 +-
 pandas/core/strings/accessor.py            |  3 +-
 pandas/tests/arrays/string_/test_string.py | 54 +++++++++-------------
 pandas/tests/strings/test_find_replace.py  |  8 ++--
 pandas/tests/strings/test_strings.py       |  8 ++--
 8 files changed, 43 insertions(+), 62 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index aecc609df574e..8885a0dcc781e 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -28,7 +28,7 @@ class Factorize:
             "datetime64[ns, tz]",
             "Int64",
             "boolean",
-            "string_arrow",
+            "string[pyarrow]",
         ],
     ]
     param_names = ["unique", "sort", "dtype"]
@@ -36,15 +36,12 @@ class Factorize:
     def setup(self, unique, sort, dtype):
         N = 10 ** 5
         string_index = tm.makeStringIndex(N)
-        try:
-            from pandas.core.arrays.string_arrow import ArrowStringDtype
-
-            string_arrow = pd.array(string_index, dtype=ArrowStringDtype())
-        except ImportError:
-            string_arrow = None
-
-        if dtype == "string_arrow" and not string_arrow:
-            raise NotImplementedError
+        string_arrow = None
+        if dtype == "string[pyarrow]":
+            try:
+                string_arrow = pd.array(string_index, dtype="string[pyarrow]")
+            except ImportError:
+                raise NotImplementedError
 
         data = {
             "int": pd.Int64Index(np.arange(N)),
@@ -57,7 +54,7 @@ def setup(self, unique, sort, dtype):
             ),
             "Int64": pd.array(np.arange(N), dtype="Int64"),
             "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
-            "string_arrow": string_arrow,
+            "string[pyarrow]": string_arrow,
         }[dtype]
         if not unique:
             data = data.repeat(5)
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 45a9053954569..5796b3f5440e7 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -50,12 +50,10 @@ def peakmem_cat_frame_construction(self, dtype):
 
 
 class Methods:
-    params = ["str", "string", "arrow_string"]
+    params = ["str", "string[python]", "string[pyarrow]"]
     param_names = ["dtype"]
 
     def setup(self, dtype):
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
         try:
             self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
         except ImportError:
@@ -213,12 +211,10 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
 
 class Contains:
 
-    params = (["str", "string", "arrow_string"], [True, False])
+    params = (["str", "string[python]", "string[pyarrow]"], [True, False])
     param_names = ["dtype", "regex"]
 
     def setup(self, dtype, regex):
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
         try:
             self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
         except ImportError:
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 50e8cc4c82e0d..8d3a8feb89d67 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -829,7 +829,6 @@ def astype(self, dtype, copy: bool = True):
         """
         from pandas import Index
         from pandas.core.arrays.string_ import StringDtype
-        from pandas.core.arrays.string_arrow import ArrowStringDtype
 
         if dtype is not None:
             dtype = pandas_dtype(dtype)
@@ -852,7 +851,7 @@ def astype(self, dtype, copy: bool = True):
             return self._shallow_copy(new_left, new_right)
         elif is_categorical_dtype(dtype):
             return Categorical(np.asarray(self), dtype=dtype)
-        elif isinstance(dtype, (StringDtype, ArrowStringDtype)):
+        elif isinstance(dtype, StringDtype):
             return dtype.construct_array_type()._from_sequence(self, copy=False)
 
         # TODO: This try/except will be repeated.
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 91aa808580ee7..9ead565c2d335 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -34,8 +34,8 @@
 from pandas.core import missing
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
-from pandas.core.arrays.string_ import StringDtype
 from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexers import (
     check_array_indexer,
     validate_indices,
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 3c7479b2e4aa8..9a100aa4231b6 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -3001,9 +3001,8 @@ def _result_dtype(arr):
     # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
     # when the list of values is empty.
     from pandas.core.arrays.string_ import StringDtype
-    from pandas.core.arrays.string_arrow import ArrowStringDtype
 
-    if isinstance(arr.dtype, (StringDtype, ArrowStringDtype)):
+    if isinstance(arr.dtype, StringDtype):
         return arr.dtype.name
     else:
         return object
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 2894370a55b1a..cbd7734c11017 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -17,21 +17,8 @@
 skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0")
 
 
-def _is_pyarrow_dtype(dtype):
-    if isinstance(dtype, str):
-        if dtype == "string[pyarrow]":
-            return True
-    else:
-        if dtype.storage == "pyarrow":
-            return True
-    return False
-
-
 @pytest.fixture(
     params=[
-        "string",
-        "string[python]",
-        pytest.param("string[pyarrow]", marks=skip_if_no_pyarrow),
         pd.StringDtype(storage="python"),
         pytest.param(
             pd.StringDtype(storage="pyarrow"),
@@ -58,12 +45,15 @@ def test_repr(dtype):
     expected = "      A\n0     a\n1  <NA>\n2     b"
     assert repr(df) == expected
 
-    dtype_name = "pyarrow" if _is_pyarrow_dtype(dtype) else "python"
-    expected = f"0       a\n1    <NA>\n2       b\nName: A, dtype: string[{dtype_name}]"
+    expected = (
+        f"0       a\n1    <NA>\n2       b\nName: A, dtype: string[{dtype.storage}]"
+    )
     assert repr(df.A) == expected
 
-    arr_name = "ArrowStringArray" if _is_pyarrow_dtype(dtype) else "StringArray"
-    expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string[{dtype_name}]"
+    arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray"
+    expected = (
+        f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string[{dtype.storage}]"
+    )
     assert repr(df.A.array) == expected
 
 
@@ -101,7 +91,7 @@ def test_setitem_with_scalar_string(dtype):
 
 
 def test_astype_roundtrip(dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = "ValueError: Could not convert object to NumPy datetime"
         mark = pytest.mark.xfail(reason=reason, raises=ValueError)
         request.node.add_marker(mark)
@@ -122,7 +112,7 @@ def test_astype_roundtrip(dtype, request):
 
 
 def test_add(dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = (
             "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and "
             "'ArrowStringArray'"
@@ -150,7 +140,7 @@ def test_add(dtype, request):
 
 
 def test_add_2d(dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = "Failed: DID NOT RAISE <class 'ValueError'>"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -166,7 +156,7 @@ def test_add_2d(dtype, request):
 
 
 def test_add_sequence(dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = (
             "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' "
             "and 'list'"
@@ -187,7 +177,7 @@ def test_add_sequence(dtype, request):
 
 
 def test_mul(dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = (
             "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
         )
@@ -270,7 +260,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ
 
 
 def test_comparison_methods_array(all_compare_operators, dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         if all_compare_operators in ["__eq__", "__ne__"]:
             reason = "NotImplementedError: Neither scalar nor ArrowStringArray"
         else:
@@ -341,7 +331,7 @@ def test_from_sequence_no_mutate(copy, cls, request):
 
 
 def test_astype_int(dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -356,7 +346,7 @@ def test_astype_int(dtype, request):
 def test_astype_float(dtype, any_float_allowed_nullable_dtype, request):
     # Don't compare arrays (37974)
 
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         if any_float_allowed_nullable_dtype in {"Float32", "Float64"}:
             reason = "TypeError: Cannot interpret 'Float32Dtype()' as a data type"
         else:
@@ -384,7 +374,7 @@ def test_reduce(skipna, dtype):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("skipna", [True, False])
 def test_min_max(method, skipna, dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'"
         mark = pytest.mark.xfail(reason=reason)
         request.node.add_marker(mark)
@@ -401,7 +391,7 @@ def test_min_max(method, skipna, dtype, request):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("box", [pd.Series, pd.array])
 def test_min_max_numpy(method, box, dtype, request):
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         if box is pd.array:
             reason = (
                 "TypeError: '<=' not supported between instances of 'str' and "
@@ -432,7 +422,7 @@ def test_reduce_missing(skipna, dtype):
 def test_fillna_args(dtype, request):
     # GH 37987
 
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         reason = (
             "AssertionError: Regex pattern \"Cannot set non-string value '1' into "
             "a StringArray.\" does not match 'Scalar must be NA or str'"
@@ -463,7 +453,7 @@ def test_arrow_array(dtype):
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         expected = pa.chunked_array(expected)
 
     assert arr.equals(expected)
@@ -487,7 +477,7 @@ def test_arrow_roundtrip(dtype):
 
 
 @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
-def test_arrow_load_from_zero_chunks(dtype, dtype_object):
+def test_arrow_load_from_zero_chunks(dtype):
     # GH-41040
     import pyarrow as pa
 
@@ -498,7 +488,7 @@ def test_arrow_load_from_zero_chunks(dtype, dtype_object):
     # Instantiate the same table with no chunks at all
     table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
     result = table.to_pandas()
-    assert isinstance(result["a"].dtype, dtype_object)
+    assert isinstance(result["a"].dtype, type(dtype))
     tm.assert_frame_equal(result, df)
 
 
@@ -546,7 +536,7 @@ def test_use_inf_as_na(values, expected, dtype):
 def test_memory_usage(dtype):
     # GH 33963
 
-    if _is_pyarrow_dtype(dtype):
+    if dtype.storage == "pyarrow":
         pytest.skip("not applicable")
 
     series = pd.Series(["a", "b", "c"], dtype=dtype)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 0c54042d983ad..99f1196ac89a9 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -18,8 +18,9 @@
     params=[
         "object",
         "string",
+        "string[python]",
         pytest.param(
-            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+            "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
         ),
     ]
 )
@@ -28,10 +29,9 @@ def any_string_dtype(request):
     Parametrized fixture for string dtypes.
     * 'object'
     * 'string'
-    * 'arrow_string'
+    * 'string[python]'
+    * 'string[pyarrow]'
     """
-    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
     return request.param
 
 
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index f218d5333b415..b8603ae71a0b1 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -23,8 +23,9 @@
     params=[
         "object",
         "string",
+        "string[python]",
         pytest.param(
-            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+            "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
         ),
     ]
 )
@@ -33,10 +34,9 @@ def any_string_dtype(request):
     Parametrized fixture for string dtypes.
     * 'object'
     * 'string'
-    * 'arrow_string'
+    * 'string[python]'
+    * 'string[pyarrow]'
     """
-    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
     return request.param
 
 

From 4fb1a0db22f2aa98b54383089a76e87f50c173c1 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sat, 1 May 2021 13:24:09 +0100
Subject: [PATCH 23/46] add draft release note

---
 doc/source/whatsnew/v1.3.0.rst | 50 ++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index b2f4de22ca5c1..7ec7cb80db463 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -164,6 +164,56 @@ a copy will no longer be made (:issue:`32960`)
 The default behavior when not passing ``copy`` will remain unchanged, i.e.
 a copy will be made.
 
+.. _whatsnew_130.arrow_string:
+
+PyArrow backed string data type
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We've enhanced the :class:`StringDtype`, an extension type dedicated to string data.
+(:issue:`39908`)
+
+It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`, use
+pandas options or specify the dtype using ``dtype='string[pyarrow]'``
+
+.. warning::
+
+   ``string[pyarrow]`` is currently considered experimental. The implementation
+   and parts of the API may change without warning.
+
+The ``'string[pyarrow]'`` extension type solves several issues with NumPy backed arrays:
+
+1. 
+2. 
+3. 
+
+
+.. ipython:: python
+
+   pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow"))
+
+You can use the alias ``"string[pyarrow]"`` as well.
+
+.. ipython:: python
+
+   s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]")
+   s
+
+The usual string accessor methods work. Where appropriate, the return type
+of the Series or columns of a DataFrame will also have string dtype.
+
+.. ipython:: python
+
+   s.str.upper()
+   s.str.split('b', expand=True).dtypes
+
+String accessor methods returning integers will return a value with :class:`Int64Dtype`
+
+.. ipython:: python
+
+   s.str.count("a")
+
+See :ref:`text.types` for more.
+
 Centered Datetime-Like Rolling Windows
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

From 15efb2e57f9e52e7076b7c492ad79ca72839d1c8 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 12 May 2021 12:45:53 +0100
Subject: [PATCH 24/46] post merge fix-up

---
 doc/source/whatsnew/v1.3.0.rst         | 6 +++---
 pandas/conftest.py                     | 8 ++++----
 pandas/tests/extension/base/casting.py | 5 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 9b209c2b46efe..d71bf046ba9ae 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -184,9 +184,9 @@ pandas options or specify the dtype using ``dtype='string[pyarrow]'``
 
 The ``'string[pyarrow]'`` extension type solves several issues with NumPy backed arrays:
 
-1. 
-2. 
-3. 
+1.
+2.
+3.
 
 
 .. ipython:: python
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 1f7a194b60ac5..b859a35c484e1 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1157,8 +1157,9 @@ def object_dtype(request):
     params=[
         "object",
         "string",
+        "string[python]",
         pytest.param(
-            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+            "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
         ),
     ]
 )
@@ -1167,10 +1168,9 @@ def any_string_dtype(request):
     Parametrized fixture for string dtypes.
     * 'object'
     * 'string'
-    * 'arrow_string'
+    * 'string[python]'
+    * 'string[pyarrow]'
     """
-    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
     return request.param
 
 
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index 99a5666926e10..ed1047f6e28f6 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -49,15 +49,14 @@ def test_astype_str(self, data):
         "nullable_string_dtype",
         [
             "string",
+            "string[python]",
             pytest.param(
-                "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+                "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
             ),
         ],
     )
     def test_astype_string(self, data, nullable_string_dtype):
         # GH-33465
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
         result = pd.Series(data[:5]).astype(nullable_string_dtype)
         expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype)
         self.assert_series_equal(result, expected)

From b53cfe015f6ad4e60b2f0cc847bdc75c40e5a436 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 12 May 2021 14:23:03 +0100
Subject: [PATCH 25/46] docstrings

---
 pandas/core/arrays/string_.py      |  6 +++---
 pandas/core/arrays/string_arrow.py |  2 +-
 pandas/core/generic.py             | 14 +++++++-------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index d08bbed7c9c75..1cb5eade8a20c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -80,7 +80,7 @@ class StringDtype(ExtensionDtype):
     Examples
     --------
     >>> pd.StringDtype()
-    StringDtype
+    string[python]
     """
 
     #: StringDtype.na_value uses pandas.NA
@@ -263,7 +263,7 @@ class StringArray(PandasArray):
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
     <StringArray>
     ['This is', 'some text', <NA>, 'data.']
-    Length: 4, dtype: string
+    Length: 4, dtype: string[python]
 
     Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
     will convert the values to strings.
@@ -275,7 +275,7 @@ class StringArray(PandasArray):
     >>> pd.array(['1', 1], dtype="string")
     <StringArray>
     ['1', '1']
-    Length: 2, dtype: string
+    Length: 2, dtype: string[python]
 
     However, instantiating StringArrays directly with non-strings will raise an error.
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index c3aa44a438e1f..9036bfde7537e 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -117,7 +117,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
 
     Examples
     --------
-    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[arrow]")
+    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
     <ArrowStringArray>
     ['This is', 'some text', <NA>, 'data.']
     Length: 4, dtype: string[pyarrow]
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 0d39f13afc426..fef2c1f46200d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6134,12 +6134,12 @@ def convert_dtypes(
         2  3  z   <NA>  <NA>    20  200.0
 
         >>> dfn.dtypes
-        a      Int32
-        b     string
-        c    boolean
-        d     string
-        e      Int64
-        f    Float64
+        a              Int32
+        b     string[python]
+        c            boolean
+        d     string[python]
+        e              Int64
+        f            Float64
         dtype: object
 
         Start with a Series of strings and missing data represented by ``np.nan``.
@@ -6157,7 +6157,7 @@ def convert_dtypes(
         0       a
         1       b
         2    <NA>
-        dtype: string
+        dtype: string[python]
         """
         if self.ndim == 1:
             return self._convert_dtypes(

From b7db53f99305f5d54ccb0db2674fac506ce067dc Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 12 May 2021 14:28:48 +0100
Subject: [PATCH 26/46] benchmarks

---
 asv_bench/benchmarks/algos/isin.py | 8 +++-----
 asv_bench/benchmarks/strings.py    | 4 +---
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py
index 44245295beafc..4b58981694014 100644
--- a/asv_bench/benchmarks/algos/isin.py
+++ b/asv_bench/benchmarks/algos/isin.py
@@ -25,8 +25,8 @@ class IsIn:
         "category[object]",
         "category[int]",
         "str",
-        "string",
-        "arrow_string",
+        "string[python]",
+        "string[pyarrow]",
     ]
     param_names = ["dtype"]
 
@@ -62,9 +62,7 @@ def setup(self, dtype):
             self.values = np.random.choice(arr, sample_size)
             self.series = Series(arr).astype("category")
 
-        elif dtype in ["str", "string", "arrow_string"]:
-            from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
+        elif dtype in ["str", "string[python]", "string[pyarrow]"]:
             try:
                 self.series = Series(tm.makeStringIndex(N), dtype=dtype)
             except ImportError:
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 47cc9fcf568ee..02cbff7a1559c 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -226,12 +226,10 @@ def time_contains(self, dtype, regex):
 
 class Split:
 
-    params = (["str", "string", "arrow_string"], [True, False])
+    params = (["str", "string[python]", "string[pyarrow]"], [True, False])
     param_names = ["dtype", "expand"]
 
     def setup(self, dtype, expand):
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
         try:
             self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--")
         except ImportError:

From 3399f08727ee04d01141e6d2244e4aad0ad54799 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 12 May 2021 14:33:21 +0100
Subject: [PATCH 27/46] pyarrow min

---
 pandas/tests/extension/arrow/test_string.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py
index f32f1e415ddc7..67a62978aa1bc 100644
--- a/pandas/tests/extension/arrow/test_string.py
+++ b/pandas/tests/extension/arrow/test_string.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-pytest.importorskip("pyarrow", minversion="0.13.0")
+pytest.importorskip("pyarrow", minversion="1.0.0")
 
 
 def test_constructor_from_list():

From 71d1e6c6bbb4f7f2ee28954000a5903a678a7228 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 26 May 2021 17:45:48 +0100
Subject: [PATCH 28/46] post merge fixup

---
 asv_bench/benchmarks/strings.py  | 4 +---
 pandas/tests/strings/test_api.py | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 2e109e59c1c6d..32fbf4e6c7de3 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -12,12 +12,10 @@
 
 
 class Dtypes:
-    params = ["str", "string", "arrow_string"]
+    params = ["str", "string[python]", "string[pyarrow]"]
     param_names = ["dtype"]
 
     def setup(self, dtype):
-        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
-
         try:
             self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
         except ImportError:
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
index ec8b5bfa11ad5..c0ae06802bdb1 100644
--- a/pandas/tests/strings/test_api.py
+++ b/pandas/tests/strings/test_api.py
@@ -128,7 +128,7 @@ def test_api_per_method(
 def test_api_for_categorical(any_string_method, any_string_dtype, request):
     # https://github.com/pandas-dev/pandas/issues/10661
 
-    if any_string_dtype == "arrow_string":
+    if any_string_dtype == "string[pyarrow]":
         # unsupported operand type(s) for +: 'ArrowStringArray' and 'str'
         mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented")
         request.node.add_marker(mark)

From 9e23c35a3e84024b9c50b056d2481e2b76a477c1 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 26 May 2021 21:26:19 +0100
Subject: [PATCH 29/46] misc clean

---
 pandas/core/arrays/string_.py              | 1 -
 pandas/tests/arrays/string_/test_string.py | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 1cb5eade8a20c..866210a07ca0f 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -138,7 +138,6 @@ def construct_from_string(cls, string):
                 f"'construct_from_string' expects a string, got {type(string)}"
             )
         if string == "string":
-            # TODO: use global default
             return cls()
         elif string == "string[python]":
             return cls(storage="python")
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index ad708350d3251..ae2fedef1d947 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -460,8 +460,7 @@ def test_arrow_roundtrip(dtype):
     table = pa.table(df)
     assert table.field("a").type == "string"
     result = table.to_pandas()
-    if not isinstance(dtype, str):
-        assert isinstance(result["a"].dtype, type(dtype))
+    assert isinstance(result["a"].dtype, type(dtype))
     tm.assert_frame_equal(result, df)
     # ensure the missing value is represented by NA and not np.nan or None
     assert result.loc[2, "a"] is pd.NA

From 64b3206a66efad4a1bb5bb9d8287bfc81adc94d0 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 27 May 2021 16:47:43 +0100
Subject: [PATCH 30/46] update construct_from_string docstring

---
 pandas/core/arrays/string_.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 591de9b9096cd..aafd34dcf7a13 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -115,13 +115,13 @@ def construct_from_string(cls, string):
             The type of the name. The storage type will be taking from `string`.
             Valid options and their storage types are
 
-            ========================== ==============
+            ========================== ==============================================
             string                     result storage
-            ========================== ==============
-            ``'string'``               global default
+            ========================== ==============================================
+            ``'string'``               pd.options.mode.string_storage, default python
             ``'string[python]'``       python
             ``'string[pyarrow]'``      pyarrow
-            ========================== =============
+            ========================== ==============================================
 
         Returns
         -------

From d83a4ff42fd8fd535bec2b8cd01bcfcf638f24c1 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 27 May 2021 16:57:49 +0100
Subject: [PATCH 31/46] update whatsnew for dtype="string"

---
 doc/source/whatsnew/v1.3.0.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index aa032c0c4fb70..012a524321f56 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -204,6 +204,14 @@ You can use the alias ``"string[pyarrow]"`` as well.
    s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]")
    s
 
+You can also create a PyArrow backed string array using pandas options.
+
+.. ipython:: python
+
+    with pd.option_context("string_storage", "pyarrow"):
+        s = pd.Series(['abc', None, 'def'], dtype="string")
+    s
+
 The usual string accessor methods work. Where appropriate, the return type
 of the Series or columns of a DataFrame will also have string dtype.
 

From aef11626d8f5de98566b4aeb8ec7032f718b1434 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 30 May 2021 14:42:27 +0100
Subject: [PATCH 32/46] update release note

---
 doc/source/whatsnew/v1.3.0.rst | 72 ++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 8dc6dc52d2687..83680eacb9912 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -179,20 +179,16 @@ We've enhanced the :class:`StringDtype`, an extension type dedicated to string d
 (:issue:`39908`)
 
 It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`, use
-pandas options or specify the dtype using ``dtype='string[pyarrow]'``
+pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the
+StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects.
+
+The PyArrow backed StringArray requires pyarrow 1.0.0 or greater to be installed.
 
 .. warning::
 
    ``string[pyarrow]`` is currently considered experimental. The implementation
    and parts of the API may change without warning.
 
-The ``'string[pyarrow]'`` extension type solves several issues with NumPy backed arrays:
-
-1.
-2.
-3.
-
-
 .. ipython:: python
 
    pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow"))
@@ -212,8 +208,8 @@ You can also create a PyArrow backed string array using pandas options.
         s = pd.Series(['abc', None, 'def'], dtype="string")
     s
 
-The usual string accessor methods work. Where appropriate, the return type
-of the Series or columns of a DataFrame will also have string dtype.
+The usual string accessor methods work. Where appropriate, the return type of the Series
+or columns of a DataFrame will also have string dtype.
 
 .. ipython:: python
 
@@ -226,7 +222,61 @@ String accessor methods returning integers will return a value with :class:`Int6
 
    s.str.count("a")
 
-See :ref:`text.types` for more.
+Some string accessor methods use native PyArrow string kernels operating directly on the
+PyArrow memory; others fall back to converting to a NumPy array of Python objects and
+using the native Python string functions. String methods using PyArrow kernels are
+generally much more performant.
+
+Some PyArrow string kernels are implemented in later versions of pyarrow than the
+minimum version required to create a PyArrow backed StringArray. In these cases, the
+string accessor will fall back to the Python implementations.
+
+Some string accessor methods accept arguments controlling their behaviour which are not
+supported by the PyArrow kernels. These cases will also fall back to object mode.
+
++--------------------------------+----------+------------------------------------------+
+| Accessor                       | Minimum  | Limitations (otherwise fall back to      |
+| Method                         | PyArrow  | object mode)                             |
+|                                | Version  |                                          |
++================================+==========+==========================================+
+| :meth:`~Series.str.contains`   | 1.0.0    | The ``flags`` argument is not supported. |
+|                                |          | If ``regex=True``, pyarrow 4.0.0 is      |
+|                                |          | required and ``case=False`` is not       |
+|                                |          | supported.                               |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.startswith` | 4.0.0    |                                          |
+| :meth:`~Series.str.endswith`   |          |                                          |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.replace`    | 4.0.0    | The ``flags`` argument, ``case=False``,  |
+|                                |          | passing a callable for the ``repl``      |
+|                                |          | argument or passing a compiled regex is  |
+|                                |          | not supported.                           |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.match`      | 4.0.0    |                                          |
+| :meth:`~Series.str.fullmatch`  |          |                                          |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.isalnum`    | 1.0.0    |                                          |
+| :meth:`~Series.str.isalpha`    |          |                                          |
+| :meth:`~Series.str.isdecimal`  |          |                                          |
+| :meth:`~Series.str.isdigit`    |          |                                          |
+| :meth:`~Series.str.islower`    |          |                                          |
+| :meth:`~Series.str.isnumeric`  |          |                                          |
+| :meth:`~Series.str.istitle`    |          |                                          |
+| :meth:`~Series.str.isupper`    |          |                                          |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.isspace`    | 2.0.0    |                                          |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.len`        | 4.0.0    |                                          |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.lower`      | 1.0.0    |                                          |
+| :meth:`~Series.str.upper`      |          |                                          |
++--------------------------------+----------+------------------------------------------+
+| :meth:`~Series.str.strip`      | 4.0.0    |                                          |
+| :meth:`~Series.str.lstrip`     |          |                                          |
+| :meth:`~Series.str.rstrip`     |          |                                          |
++--------------------------------+----------+------------------------------------------+
+
+
 
 Centered Datetime-Like Rolling Windows
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

From 6247a5b2dfddab8497f1874911b20e6272985fb6 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 30 May 2021 16:49:30 +0100
Subject: [PATCH 33/46] paramertize test for df.convert_dtypes()

---
 .../tests/frame/methods/test_convert_dtypes.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index dd7bf0aada449..2d3f0011617fc 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -9,7 +11,16 @@ class TestConvertDtypes:
     @pytest.mark.parametrize(
         "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
     )
-    def test_convert_dtypes(self, convert_integer, expected):
+    @pytest.mark.parametrize(
+        "string_storage",
+        [
+            "python",
+            pytest.param(
+                "pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+            ),
+        ],
+    )
+    def test_convert_dtypes(self, convert_integer, expected, string_storage):
         # Specific types are tested in tests/series/test_dtypes.py
         # Just check that it works for DataFrame here
         df = pd.DataFrame(
@@ -18,11 +29,12 @@ def test_convert_dtypes(self, convert_integer, expected):
                 "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
             }
         )
-        result = df.convert_dtypes(True, True, convert_integer, False)
+        with pd.option_context("string_storage", string_storage):
+            result = df.convert_dtypes(True, True, convert_integer, False)
         expected = pd.DataFrame(
             {
                 "a": pd.Series([1, 2, 3], dtype=expected),
-                "b": pd.Series(["x", "y", "z"], dtype="string"),
+                "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"),
             }
         )
         tm.assert_frame_equal(result, expected)

From a6d066ca43f44879f4a01c74c805b2bf4b0790b7 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 31 May 2021 13:20:33 +0100
Subject: [PATCH 34/46] fixup pd.array and more testing of string_storage
 option

---
 pandas/conftest.py                            | 16 ++++
 pandas/core/arrays/string_.py                 | 11 +--
 pandas/core/arrays/string_arrow.py            |  5 ++
 pandas/core/construction.py                   | 25 +++++--
 .../tests/arrays/string_/test_string_arrow.py | 74 ++++++++++++++++---
 pandas/tests/arrays/test_array.py             | 23 ++++--
 pandas/tests/arrays/test_datetimelike.py      | 21 +++---
 pandas/tests/series/methods/test_astype.py    | 22 +++++-
 pandas/tests/strings/test_api.py              |  5 +-
 9 files changed, 164 insertions(+), 38 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index b859a35c484e1..a880fcc08e0fe 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1131,6 +1131,22 @@ def nullable_string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        "python",
+        pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")),
+    ]
+)
+def string_storage(request):
+    """
+    Parametrized fixture for pd.options.mode.string_storage.
+
+    * 'python'
+    * 'pyarrow'
+    """
+    return request.param
+
+
 @pytest.fixture(params=tm.BYTES_DTYPES)
 def bytes_dtype(request):
     """
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index aafd34dcf7a13..c80c1263c3bf8 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -295,7 +295,7 @@ def __init__(self, values, copy=False):
         super().__init__(values, copy=copy)
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
-        NDArrayBacked.__init__(self, self._ndarray, StringDtype())
+        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
         if not isinstance(values, type(self)):
             self._validate()
 
@@ -311,8 +311,9 @@ def _validate(self):
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
-        if dtype:
-            assert dtype == "string"
+        if dtype and not (isinstance(dtype, str) and dtype == "string"):
+            dtype = pandas_dtype(dtype)
+            assert isinstance(dtype, StringDtype) and dtype.storage == "python"
 
         from pandas.core.arrays.masked import BaseMaskedArray
 
@@ -332,7 +333,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
         # Manually creating new array avoids the validation step in the __init__, so is
         # faster. Refactor need for validation?
         new_string_array = cls.__new__(cls)
-        NDArrayBacked.__init__(new_string_array, result, StringDtype())
+        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))
 
         return new_string_array
 
@@ -501,7 +502,7 @@ def _str_map(
         from pandas.arrays import BooleanArray
 
         if dtype is None:
-            dtype = StringDtype()
+            dtype = StringDtype(storage="python")
         if na_value is None:
             na_value = self.dtype.na_value
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index b0a36b7c02644..9d4992f6f04a8 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -35,6 +35,7 @@
     is_object_dtype,
     is_scalar,
     is_string_dtype,
+    pandas_dtype,
 )
 from pandas.core.dtypes.missing import isna
 
@@ -154,6 +155,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
 
         cls._chk_pyarrow_available()
 
+        if dtype and not (isinstance(dtype, str) and dtype == "string"):
+            dtype = pandas_dtype(dtype)
+            assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
+
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype in ensure_string_array and
             # numerical issues with Float32Dtype
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index b05bc895d0081..47e482b849095 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -113,18 +113,22 @@ def array(
 
         Currently, pandas will infer an extension dtype for sequences of
 
-        ============================== =====================================
+        ============================== =======================================
         Scalar Type                    Array Type
-        ============================== =====================================
+        ============================== =======================================
         :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
         :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
         :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
         :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
         :class:`int`                   :class:`pandas.arrays.IntegerArray`
         :class:`float`                 :class:`pandas.arrays.FloatingArray`
-        :class:`str`                   :class:`pandas.arrays.StringArray`
+        :class:`str`                   :class:`pandas.arrays.StringArray` or
+                                       :class:`pandas.arrays.ArrowStringArray`
         :class:`bool`                  :class:`pandas.arrays.BooleanArray`
-        ============================== =====================================
+        ============================== =======================================
+
+        The ExtensionArray created when the scalar type is :class:`str` is determined by
+        pd.options.mode.string_storage if the dtype is not explicitly given.
 
         For all other cases, NumPy's usual inference rules will be used.
 
@@ -240,6 +244,14 @@ def array(
     ['a', <NA>, 'c']
     Length: 3, dtype: string[python]
 
+    >>> with pd.option_context("string_storage", "pyarrow"):
+    ...     arr = pd.array(["a", None, "c"])
+    ...
+    >>> arr
+    <ArrowStringArray>
+    ['a', <NA>, 'c']
+    Length: 3, dtype: string[pyarrow]
+
     >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
     <PeriodArray>
     ['2000-01-01', '2000-01-01']
@@ -292,10 +304,10 @@ def array(
         IntegerArray,
         IntervalArray,
         PandasArray,
-        StringArray,
         TimedeltaArray,
         period_array,
     )
+    from pandas.core.arrays.string_ import StringDtype
 
     if lib.is_scalar(data):
         msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
@@ -345,7 +357,8 @@ def array(
             return TimedeltaArray._from_sequence(data, copy=copy)
 
         elif inferred_dtype == "string":
-            return StringArray._from_sequence(data, copy=copy)
+            # StringArray/ArrowStringArray depending on pd.options.mode.string_storage
+            return StringDtype().construct_array_type()._from_sequence(data, copy=copy)
 
         elif inferred_dtype == "integer":
             return IntegerArray._from_sequence(data, copy=copy)
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index be89db9f25d20..bb16754182c87 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -8,27 +8,33 @@
 
 pa = pytest.importorskip("pyarrow", minversion="1.0.0")
 
+from pandas.core.arrays.string_ import (
+    StringArray,
+    StringDtype,
+)
 from pandas.core.arrays.string_arrow import ArrowStringArray
 
 
 def test_eq_all_na():
-    a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow"))
+    a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow"))
     result = a == a
     expected = pd.array([pd.NA, pd.NA], dtype="boolean")
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_config():
-    # python by default
-    assert pd.StringDtype().storage == "python"
-    arr = pd.array(["a", "b"])
-    assert arr.dtype.storage == "python"
+def test_config(string_storage):
+    with pd.option_context("string_storage", string_storage):
+        assert StringDtype().storage == string_storage
+        result = pd.array(["a", "b"])
+        assert result.dtype.storage == string_storage
 
-    with pd.option_context("mode.string_storage", "pyarrow"):
-        assert pd.StringDtype().storage == "pyarrow"
-        arr = pd.array(["a", "b"])
-        assert arr.dtype.storage == "pyarrow"
+    expected = (
+        StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"])
+    )
+    tm.assert_equal(result, expected)
 
+
+def test_config_bad_storage_raises():
     msg = re.escape("Value must be one of python|pyarrow")
     with pytest.raises(ValueError, match=msg):
         pd.options.mode.string_storage = "foo"
@@ -50,3 +56,51 @@ def test_constructor_not_string_type_raises(array, chunked):
         )
     with pytest.raises(ValueError, match=msg):
         ArrowStringArray(arr)
+
+
+def test_from_sequence_wrong_dtype_raises():
+    with pd.option_context("string_storage", "python"):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    with pd.option_context("string_storage", "pyarrow"):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    with pytest.raises(AssertionError, match=None):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]")
+
+    ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")
+
+    with pytest.raises(AssertionError, match=None):
+        with pd.option_context("string_storage", "python"):
+            ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    with pd.option_context("string_storage", "pyarrow"):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    with pytest.raises(AssertionError, match=None):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))
+
+    ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
+
+    with pd.option_context("string_storage", "python"):
+        StringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    with pd.option_context("string_storage", "pyarrow"):
+        StringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    StringArray._from_sequence(["a", None, "c"], dtype="string[python]")
+
+    with pytest.raises(AssertionError, match=None):
+        StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")
+
+    with pd.option_context("string_storage", "python"):
+        StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    with pytest.raises(AssertionError, match=None):
+        with pd.option_context("string_storage", "pyarrow"):
+            StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))
+
+    with pytest.raises(AssertionError, match=None):
+        StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index bfe588883d9f3..61d56df485ab1 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -18,7 +18,6 @@
     IntegerArray,
     IntervalArray,
     SparseArray,
-    StringArray,
     TimedeltaArray,
 )
 from pandas.core.arrays import (
@@ -132,8 +131,16 @@
         ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
         (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
         # String
-        (["a", None], "string", StringArray._from_sequence(["a", None])),
-        (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])),
+        (
+            ["a", None],
+            "string",
+            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
+        ),
+        (
+            ["a", None],
+            pd.StringDtype(),
+            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
+        ),
         # Boolean
         ([True, None], "boolean", BooleanArray._from_sequence([True, None])),
         ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])),
@@ -253,8 +260,14 @@ def test_array_copy():
         ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
         ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])),
         # string
-        (["a", "b"], StringArray._from_sequence(["a", "b"])),
-        (["a", None], StringArray._from_sequence(["a", None])),
+        (
+            ["a", "b"],
+            pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]),
+        ),
+        (
+            ["a", None],
+            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
+        ),
         # Boolean
         ([True, False], BooleanArray._from_sequence([True, False])),
         ([True, None], BooleanArray._from_sequence([True, None])),
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index c6f8efe7b939e..0bd10b36a8b5c 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -298,7 +298,7 @@ def test_searchsorted(self):
             assert result == 10
 
     @pytest.mark.parametrize("box", [None, "index", "series"])
-    def test_searchsorted_castable_strings(self, arr1d, box, request):
+    def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage):
         if isinstance(arr1d, DatetimeArray):
             tz = arr1d.tz
             ts1, ts2 = arr1d[1:3]
@@ -341,14 +341,17 @@ def test_searchsorted_castable_strings(self, arr1d, box, request):
         ):
             arr.searchsorted("foo")
 
-        with pytest.raises(
-            TypeError,
-            match=re.escape(
-                f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
-                "or array of those. Got 'StringArray' instead."
-            ),
-        ):
-            arr.searchsorted([str(arr[1]), "baz"])
+        arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray"
+
+        with pd.option_context("string_storage", string_storage):
+            with pytest.raises(
+                TypeError,
+                match=re.escape(
+                    f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
+                    f"or array of those. Got '{arr_type}' instead."
+                ),
+            ):
+                arr.searchsorted([str(arr[1]), "baz"])
 
     def test_getitem_near_implementation_bounds(self):
         # We only check tz-naive for DTA bc the bounds are slightly different
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 157b76c630ef9..ae3958995864c 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -12,6 +12,7 @@
 from pandas._libs.tslibs import iNaT
 import pandas.util._test_decorators as td
 
+import pandas as pd
 from pandas import (
     NA,
     Categorical,
@@ -377,7 +378,9 @@ class TestAstypeString:
             # currently no way to parse IntervalArray from a list of strings
         ],
     )
-    def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request):
+    def test_astype_string_to_extension_dtype_roundtrip(
+        self, data, dtype, request, string_storage
+    ):
         if dtype == "boolean" or (
             dtype in ("period[M]", "datetime64[ns]", "timedelta64[ns]") and NaT in data
         ):
@@ -385,9 +388,24 @@ def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request):
                 reason="TODO StringArray.astype() with missing values #GH40566"
             )
             request.node.add_marker(mark)
+
+        if string_storage == "pyarrow" and dtype in (
+            "category",
+            "datetime64[ns]",
+            "datetime64[ns, US/Eastern]",
+            "UInt16",
+            "period[M]",
+        ):
+            mark = pytest.mark.xfail(
+                reason="TypeError: Cannot interpret ... as a data type"
+            )
+            request.node.add_marker(mark)
+
         # GH-40351
         s = Series(data, dtype=dtype)
-        tm.assert_series_equal(s, s.astype("string").astype(dtype))
+        with pd.option_context("string_storage", string_storage):
+            result = s.astype("string").astype(dtype)
+        tm.assert_series_equal(result, s)
 
 
 class TestAstypeCategorical:
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
index c0ae06802bdb1..6cbf2dd606692 100644
--- a/pandas/tests/strings/test_api.py
+++ b/pandas/tests/strings/test_api.py
@@ -6,6 +6,7 @@
     MultiIndex,
     Series,
     _testing as tm,
+    get_option,
 )
 from pandas.core import strings as strings
 
@@ -128,7 +129,9 @@ def test_api_per_method(
 def test_api_for_categorical(any_string_method, any_string_dtype, request):
     # https://github.com/pandas-dev/pandas/issues/10661
 
-    if any_string_dtype == "string[pyarrow]":
+    if any_string_dtype == "string[pyarrow]" or (
+        any_string_dtype == "string" and get_option("string_storage") == "pyarrow"
+    ):
         # unsupported operand type(s) for +: 'ArrowStringArray' and 'str'
         mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented")
         request.node.add_marker(mark)

From 8adb08d481777e7a6aca2fe5f390d3c36b0c1ae9 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 31 May 2021 13:52:03 +0100
Subject: [PATCH 35/46] use string_storage fixture more

---
 pandas/tests/arrays/string_/test_string.py    | 27 +++++--------------
 pandas/tests/extension/test_string.py         | 13 +++------
 .../frame/methods/test_convert_dtypes.py      | 11 --------
 3 files changed, 9 insertions(+), 42 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index ae2fedef1d947..5179378b86ba0 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -14,30 +14,15 @@
 import pandas._testing as tm
 from pandas.core.arrays.string_arrow import ArrowStringArray
 
-skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0")
 
-
-@pytest.fixture(
-    params=[
-        pd.StringDtype(storage="python"),
-        pytest.param(
-            pd.StringDtype(storage="pyarrow"),
-            marks=skip_if_no_pyarrow,
-        ),
-    ]
-)
-def dtype(request):
-    return request.param
+@pytest.fixture
+def dtype(string_storage):
+    return pd.StringDtype(storage=string_storage)
 
 
-@pytest.fixture(
-    params=[
-        pd.arrays.StringArray,
-        pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow),
-    ]
-)
-def cls(request):
-    return request.param
+@pytest.fixture
+def cls(dtype):
+    return dtype.construct_array_type()
 
 
 def test_repr(dtype):
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 269ede7ca93ae..02e1cb31fd41a 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -18,8 +18,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas.core.arrays.string_ import StringDtype
 from pandas.tests.extension import base
@@ -48,14 +46,9 @@ def chunked(request):
     return request.param
 
 
-@pytest.fixture(
-    params=[
-        "python",
-        pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")),
-    ]
-)
-def dtype(request):
-    return StringDtype(storage=request.param)
+@pytest.fixture
+def dtype(string_storage):
+    return StringDtype(storage=string_storage)
 
 
 @pytest.fixture
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index 2d3f0011617fc..a2d539d784d3c 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 import pandas as pd
 import pandas._testing as tm
 
@@ -11,15 +9,6 @@ class TestConvertDtypes:
     @pytest.mark.parametrize(
         "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
     )
-    @pytest.mark.parametrize(
-        "string_storage",
-        [
-            "python",
-            pytest.param(
-                "pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
-            ),
-        ],
-    )
     def test_convert_dtypes(self, convert_integer, expected, string_storage):
         # Specific types are tested in tests/series/test_dtypes.py
         # Just check that it works for DataFrame here

From 56714c9fd3a9a0158a19bc533b2848dd2275112b Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 31 May 2021 19:11:16 +0100
Subject: [PATCH 36/46] post merge fixup

---
 pandas/core/arrays/string_.py                    |  6 ++++++
 pandas/tests/arrays/string_/test_string_arrow.py | 14 ++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index c80c1263c3bf8..f33311d4d9114 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -19,6 +19,7 @@
     Scalar,
     type_t,
 )
+from pandas.compat import pa_version_under1p0
 from pandas.compat.numpy import function as nv
 
 from pandas.core.dtypes.base import (
@@ -94,6 +95,11 @@ def __init__(self, storage=None):
             raise ValueError(
                 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
             )
+        if storage == "pyarrow" and pa_version_under1p0:
+            raise ImportError(
+                "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
+            )
+
         self.storage = storage
 
     @property
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 443984cef7687..c3f951adf7f89 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -11,11 +11,15 @@
     StringArray,
     StringDtype,
 )
-from pandas.core.arrays.string_arrow import (
-    ArrowStringArray,
+from pandas.core.arrays.string_arrow import ArrowStringArray
+
+skip_if_no_pyarrow = pytest.mark.skipif(
+    pa_version_under1p0,
+    reason="pyarrow>=1.0.0 is required for PyArrow backed StringArray",
 )
 
 
+@skip_if_no_pyarrow
 def test_eq_all_na():
     a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow"))
     result = a == a
@@ -41,10 +45,7 @@ def test_config_bad_storage_raises():
         pd.options.mode.string_storage = "foo"
 
 
-@pytest.mark.skipif(
-    pa_version_under1p0,
-    reason="pyarrow>=1.0.0 is required for PyArrow backed StringArray",
-)
+@skip_if_no_pyarrow
 @pytest.mark.parametrize("chunked", [True, False])
 @pytest.mark.parametrize("array", ["numpy", "pyarrow"])
 def test_constructor_not_string_type_raises(array, chunked):
@@ -67,6 +68,7 @@ def test_constructor_not_string_type_raises(array, chunked):
         ArrowStringArray(arr)
 
 
+@skip_if_no_pyarrow
 def test_from_sequence_wrong_dtype_raises():
     with pd.option_context("string_storage", "python"):
         ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")

From 1761a84eb2e7e89956334881be1ed27801b8bf38 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 10:15:17 +0100
Subject: [PATCH 37/46] remove accessor methods section from release note

---
 doc/source/whatsnew/v1.3.0.rst | 56 +---------------------------------
 1 file changed, 1 insertion(+), 55 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index a0e8a4905f20b..12f5ced79b934 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -222,61 +222,7 @@ String accessor methods returning integers will return a value with :class:`Int6
 
    s.str.count("a")
 
-Some string accessor methods use native PyArrow string kernels operating directly on the
-PyArrow memory, others fallback to converting to a NumPy array of Python objects and
-using the native Python string functions. String methods using Pyarrow kernels are
-generally much more performant.
-
-Some PyArrow string kernels are implemented in later versions of pyarrow that the
-minimum version required to create a PyArrow backed StringArray. In these cases, the
-string accessor will fall back to the Python implementations.
-
-Some string accessor methods accept arguments controlling their behaviour which are not
-supported by the PyArrow kernels. These cases will also fall back to object mode.
-
-+--------------------------------+----------+------------------------------------------+
-| Accessor                       | Minimum  | Limitations (otherwise fall back to      |
-| Method                         | PyArrow  | object mode)                             |
-|                                | Version  |                                          |
-+================================+==========+==========================================+
-| :meth:`~Series.str.contains`   | 1.0.0    | The ``flags`` argument is not supported. |
-|                                |          | If ``regex=True``, pyarrow 4.0.0 is      |
-|                                |          | required and ``case=False`` is not       |
-|                                |          | supported.                               |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.startswith` | 4.0.0    |                                          |
-| :meth:`~Series.str.endswith`   |          |                                          |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.replace`    | 4.0.0    | The ``flags`` argument, ``case=False``,  |
-|                                |          | passing a callable for the ``repr``      |
-|                                |          | argument or passing a compiled regex is  |
-|                                |          | not supported.                           |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.match`      | 4.0.0    |                                          |
-| :meth:`~Series.str.fullmatch`  |          |                                          |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.isalnum`    | 1.0.0    |                                          |
-| :meth:`~Series.str.isalpha`    |          |                                          |
-| :meth:`~Series.str.isdecimal`  |          |                                          |
-| :meth:`~Series.str.isdigit`    |          |                                          |
-| :meth:`~Series.str.islower`    |          |                                          |
-| :meth:`~Series.str.isnumeric`  |          |                                          |
-| :meth:`~Series.str.istitle`    |          |                                          |
-| :meth:`~Series.str.isupper`    |          |                                          |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.isspace`    | 2.0.0    |                                          |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.len`        | 4.0.0    |                                          |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.lower`      | 1.0.0    |                                          |
-| :meth:`~Series.str.upper`      |          |                                          |
-+--------------------------------+----------+------------------------------------------+
-| :meth:`~Series.str.strip`      | 4.0.0    |                                          |
-| :meth:`~Series.str.lstrip`     |          |                                          |
-| :meth:`~Series.str.rstrip`     |          |                                          |
-+--------------------------------+----------+------------------------------------------+
-
-
+See :ref:`text.types` for more.
 
 Centered Datetime-Like Rolling Windows
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

From 3e26baa7c29402010538db70be59c59681fe9889 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 10:27:42 +0100
Subject: [PATCH 38/46] consistent dtype naming in benchmark

---
 asv_bench/benchmarks/algorithms.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 8885a0dcc781e..e48a2060a3b34 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -23,7 +23,7 @@ class Factorize:
             "int",
             "uint",
             "float",
-            "string",
+            "object",
             "datetime64[ns]",
             "datetime64[ns, tz]",
             "Int64",
@@ -47,7 +47,7 @@ def setup(self, unique, sort, dtype):
             "int": pd.Int64Index(np.arange(N)),
             "uint": pd.UInt64Index(np.arange(N)),
             "float": pd.Float64Index(np.random.randn(N)),
-            "string": string_index,
+            "object": string_index,
             "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"

From 6b470b1ed53f045bcb9bb5caf20ba1aaca3e2137 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 12:30:49 +0100
Subject: [PATCH 39/46] Apply suggestions from code review

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
---
 doc/source/whatsnew/v1.3.0.rst | 2 +-
 pandas/core/construction.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 12f5ced79b934..81b9b4132b8c9 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -178,7 +178,7 @@ PyArrow backed string data type
 We've enhanced the :class:`StringDtype`, an extension type dedicated to string data.
 (:issue:`39908`)
 
-It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`, use
+It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`. Use
 pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the
 StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects.
 
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 061d9aadfbd7b..92eff02ec1307 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -125,7 +125,7 @@ def array(
         ============================== =======================================
 
         The ExtensionArray created when the scalar type is :class:`str` is determined by
-        pd.options.mode.string_storage if the dtype is not explicitly given.
+        ``pd.options.mode.string_storage`` if the dtype is not explicitly given.
 
         For all other cases, NumPy's usual inference rules will be used.
 

From 2ec6de0446abe9ba9c33de331d97c232fb1e55ba Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 14:53:03 +0100
Subject: [PATCH 40/46] name and str() change to "string"

---
 pandas/_testing/asserters.py               | 13 +++++++++++--
 pandas/core/arrays/string_.py              | 13 +++++++------
 pandas/core/arrays/string_arrow.py         |  2 +-
 pandas/core/construction.py                |  4 ++--
 pandas/core/generic.py                     | 14 +++++++-------
 pandas/core/strings/accessor.py            |  2 +-
 pandas/tests/arrays/string_/test_string.py |  8 ++------
 pandas/tests/extension/test_string.py      |  8 ++++++--
 8 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index 2d695458e32e6..ccd73810981d3 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -48,6 +48,7 @@
     TimedeltaArray,
 )
 from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
+from pandas.core.arrays.string_ import StringDtype
 
 from pandas.io.formats.printing import pprint_thing
 
@@ -638,12 +639,20 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None)
 
     if isinstance(left, np.ndarray):
         left = pprint_thing(left)
-    elif is_categorical_dtype(left) or isinstance(left, PandasDtype):
+    elif (
+        is_categorical_dtype(left)
+        or isinstance(left, PandasDtype)
+        or isinstance(left, StringDtype)
+    ):
         left = repr(left)
 
     if isinstance(right, np.ndarray):
         right = pprint_thing(right)
-    elif is_categorical_dtype(right) or isinstance(right, PandasDtype):
+    elif (
+        is_categorical_dtype(right)
+        or isinstance(right, PandasDtype)
+        or isinstance(right, StringDtype)
+    ):
         right = repr(right)
 
     msg += f"""
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index f33311d4d9114..5a3c27b7ae0af 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -84,6 +84,8 @@ class StringDtype(ExtensionDtype):
     string[python]
     """
 
+    name = "string"
+
     #: StringDtype.na_value uses pandas.NA
     na_value = libmissing.NA
     _metadata = ("storage",)
@@ -102,10 +104,6 @@ def __init__(self, storage=None):
 
         self.storage = storage
 
-    @property
-    def name(self):
-        return f"string[{self.storage}]"
-
     @property
     def type(self) -> type[str]:
         return str
@@ -182,6 +180,9 @@ def construct_array_type(  # type: ignore[override]
             return ArrowStringArray
 
     def __repr__(self):
+        return f"string[{self.storage}]"
+
+    def __str__(self):
         return self.name
 
     def __from_arrow__(
@@ -268,7 +269,7 @@ class StringArray(PandasArray):
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
     <StringArray>
     ['This is', 'some text', <NA>, 'data.']
-    Length: 4, dtype: string[python]
+    Length: 4, dtype: string
 
     Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
     will convert the values to strings.
@@ -280,7 +281,7 @@ class StringArray(PandasArray):
     >>> pd.array(['1', 1], dtype="string")
     <StringArray>
     ['1', '1']
-    Length: 2, dtype: string[python]
+    Length: 2, dtype: string
 
     However, instantiating StringArrays directly with non-strings will raise an error.
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 09c4c519bfc2b..019b94f5dd207 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -127,7 +127,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
     <ArrowStringArray>
     ['This is', 'some text', <NA>, 'data.']
-    Length: 4, dtype: string[pyarrow]
+    Length: 4, dtype: string
     """
 
     def __init__(self, values):
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 92eff02ec1307..2632b5ba2d287 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -239,7 +239,7 @@ def array(
     >>> pd.array(["a", None, "c"])
     <StringArray>
     ['a', <NA>, 'c']
-    Length: 3, dtype: string[python]
+    Length: 3, dtype: string
 
     >>> with pd.option_context("string_storage", "pyarrow"):
     ...     arr = pd.array(["a", None, "c"])
@@ -247,7 +247,7 @@ def array(
     >>> arr
     <ArrowStringArray>
     ['a', <NA>, 'c']
-    Length: 3, dtype: string[pyarrow]
+    Length: 3, dtype: string
 
     >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
     <PeriodArray>
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index a15d602e0d724..49dc71954fd8f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6139,12 +6139,12 @@ def convert_dtypes(
         2  3  z   <NA>  <NA>    20  200.0
 
         >>> dfn.dtypes
-        a              Int32
-        b     string[python]
-        c            boolean
-        d     string[python]
-        e              Int64
-        f            Float64
+        a      Int32
+        b     string
+        c    boolean
+        d     string
+        e      Int64
+        f    Float64
         dtype: object
 
         Start with a Series of strings and missing data represented by ``np.nan``.
@@ -6162,7 +6162,7 @@ def convert_dtypes(
         0       a
         1       b
         2    <NA>
-        dtype: string[python]
+        dtype: string
         """
         if self.ndim == 1:
             return self._convert_dtypes(
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 7643019ff8c55..aa867ae4dd401 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -3080,7 +3080,7 @@ def _result_dtype(arr):
     from pandas.core.arrays.string_ import StringDtype
 
     if isinstance(arr.dtype, StringDtype):
-        return arr.dtype.name
+        return arr.dtype
     else:
         return object
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 214f2184ee2fe..92d0d19901b21 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -32,15 +32,11 @@ def test_repr(dtype):
     expected = "      A\n0     a\n1  <NA>\n2     b"
     assert repr(df) == expected
 
-    expected = (
-        f"0       a\n1    <NA>\n2       b\nName: A, dtype: string[{dtype.storage}]"
-    )
+    expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
     assert repr(df.A) == expected
 
     arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray"
-    expected = (
-        f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string[{dtype.storage}]"
-    )
+    expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
     assert repr(df.A.array) == expected
 
 
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 02e1cb31fd41a..3d0edb70d1ced 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -94,7 +94,9 @@ def data_for_grouping(dtype, chunked):
 
 
 class TestDtype(base.BaseDtypeTests):
-    pass
+    def test_eq_with_str(self, dtype):
+        assert dtype == f"string[{dtype.storage}]"
+        super().test_eq_with_str(dtype)
 
 
 class TestInterface(base.BaseInterfaceTests):
@@ -106,7 +108,9 @@ def test_view(self, data, request):
 
 
 class TestConstructors(base.BaseConstructorsTests):
-    pass
+    def test_from_dtype(self, data):
+        # base test uses string representation of dtype
+        pass
 
 
 class TestReshaping(base.BaseReshapingTests):

From a0b7a70f347a9575268ecadde647faed578ebbf1 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 15:28:09 +0100
Subject: [PATCH 41/46] remove testing of string dtype without storage
 specified.

---
 pandas/conftest.py                         | 4 ----
 pandas/tests/extension/base/casting.py     | 1 -
 pandas/tests/frame/methods/test_astype.py  | 1 -
 pandas/tests/series/methods/test_astype.py | 2 --
 pandas/tests/series/methods/test_update.py | 1 -
 5 files changed, 9 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 690a0a524e69a..f1c0280bc52bb 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1120,7 +1120,6 @@ def string_dtype(request):
 
 @pytest.fixture(
     params=[
-        "string",
         "string[python]",
         pytest.param(
             "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
@@ -1131,7 +1130,6 @@ def nullable_string_dtype(request):
     """
     Parametrized fixture for string dtypes.
 
-    * 'string'
     * 'string[python]'
     * 'string[pyarrow]'
     """
@@ -1179,7 +1177,6 @@ def object_dtype(request):
 @pytest.fixture(
     params=[
         "object",
-        "string",
         "string[python]",
         pytest.param(
             "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
@@ -1190,7 +1187,6 @@ def any_string_dtype(request):
     """
     Parametrized fixture for string dtypes.
     * 'object'
-    * 'string'
     * 'string[python]'
     * 'string[pyarrow]'
     """
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index ed1047f6e28f6..9c59c79f677de 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -48,7 +48,6 @@ def test_astype_str(self, data):
     @pytest.mark.parametrize(
         "nullable_string_dtype",
         [
-            "string",
             "string[python]",
             pytest.param(
                 "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index e8b533f0c8817..881f8db305240 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -584,7 +584,6 @@ def test_astype_empty_dtype_dict(self):
     @pytest.mark.parametrize(
         "data, dtype",
         [
-            (["x", "y", "z"], "string"),
             (["x", "y", "z"], "string[python]"),
             pytest.param(
                 ["x", "y", "z"],
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index c610475581952..99a7ba910eb74 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -249,7 +249,6 @@ def test_td64_series_astype_object(self):
     @pytest.mark.parametrize(
         "data, dtype",
         [
-            (["x", "y", "z"], "string"),
             (["x", "y", "z"], "string[python]"),
             pytest.param(
                 ["x", "y", "z"],
@@ -387,7 +386,6 @@ def test_astype_string_to_extension_dtype_roundtrip(
                 reason="TODO StringArray.astype() with missing values #GH40566"
             )
             request.node.add_marker(mark)
-
         # GH-40351
         s = Series(data, dtype=dtype)
         result = s.astype(nullable_string_dtype).astype(dtype)
diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py
index 98cfb4cd6414d..d9d6641d54237 100644
--- a/pandas/tests/series/methods/test_update.py
+++ b/pandas/tests/series/methods/test_update.py
@@ -86,7 +86,6 @@ def test_update_from_non_series(self, series, other, expected):
     @pytest.mark.parametrize(
         "data, other, expected, dtype",
         [
-            (["a", None], [None, "b"], ["a", "b"], "string"),
             (["a", None], [None, "b"], ["a", "b"], "string[python]"),
             pytest.param(
                 ["a", None],

From d9dcd20f99ca6fc2f0a08eb2a3f4a54f29309af3 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 15:42:28 +0100
Subject: [PATCH 42/46] update StringDtype docstring

---
 pandas/core/arrays/string_.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 5a3c27b7ae0af..b57181987d90b 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -70,6 +70,11 @@ class StringDtype(ExtensionDtype):
        In particular, StringDtype.na_value may change to no longer be
        ``numpy.nan``.
 
+    Parameters
+    ----------
+    storage : {"python", "pyarrow"}, optional
+        If not given, the value of ``pd.options.mode.string_storage`` is used.
+
     Attributes
     ----------
     None
@@ -82,6 +87,9 @@ class StringDtype(ExtensionDtype):
     --------
     >>> pd.StringDtype()
     string[python]
+
+    >>> pd.StringDtype(storage="pyarrow")
+    string[pyarrow]
     """
 
     name = "string"

From 4a37470d4802df49ea03982178d939348016bea3 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 16:17:19 +0100
Subject: [PATCH 43/46] add ArrowStringArray to pd.arrays namespace

---
 doc/source/reference/arrays.rst    | 1 +
 pandas/arrays/__init__.py          | 2 ++
 pandas/core/arrays/__init__.py     | 2 ++
 pandas/core/arrays/string_arrow.py | 3 ++-
 4 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 43e2509469488..c6fda85b0486d 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -480,6 +480,7 @@ we recommend using :class:`StringDtype` (with the alias ``"string"``).
    :template: autosummary/class_without_autosummary.rst
 
    arrays.StringArray
+   arrays.ArrowStringArray
 
 .. autosummary::
    :toctree: api/
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
index 0fa070b6e4fc4..89d362eb77e68 100644
--- a/pandas/arrays/__init__.py
+++ b/pandas/arrays/__init__.py
@@ -4,6 +4,7 @@
 See :ref:`extending.extension-types` for more.
 """
 from pandas.core.arrays import (
+    ArrowStringArray,
     BooleanArray,
     Categorical,
     DatetimeArray,
@@ -18,6 +19,7 @@
 )
 
 __all__ = [
+    "ArrowStringArray",
     "BooleanArray",
     "Categorical",
     "DatetimeArray",
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index 22f15ca9650db..e301e82a0ee75 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -17,12 +17,14 @@
 )
 from pandas.core.arrays.sparse import SparseArray
 from pandas.core.arrays.string_ import StringArray
+from pandas.core.arrays.string_arrow import ArrowStringArray
 from pandas.core.arrays.timedeltas import TimedeltaArray
 
 __all__ = [
     "ExtensionArray",
     "ExtensionOpsMixin",
     "ExtensionScalarOpsMixin",
+    "ArrowStringArray",
     "BaseMaskedArray",
     "BooleanArray",
     "Categorical",
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 019b94f5dd207..1539c6db6317f 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -673,7 +673,8 @@ def astype(self, dtype, copy=True):
     # ------------------------------------------------------------------------
     # String methods interface
 
-    _str_na_value = StringDtype.na_value
+    # error: Cannot determine type of 'na_value'
+    _str_na_value = StringDtype.na_value  # type: ignore[has-type]
 
     def _str_map(
         self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True

From 1d59c7a84c9fbef748e60febd92ce983da9534ba Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Wed, 2 Jun 2021 16:50:28 +0100
Subject: [PATCH 44/46] add common base class, BaseStringArray

---
 pandas/core/arrays/string_.py       | 13 ++++++++-----
 pandas/core/arrays/string_arrow.py  |  7 +++++--
 pandas/core/dtypes/cast.py          |  8 ++------
 pandas/core/strings/object_array.py |  5 ++---
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index b57181987d90b..8d150c8f6ad3d 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -43,6 +43,7 @@
     IntegerArray,
     PandasArray,
 )
+from pandas.core.arrays.base import ExtensionArray
 from pandas.core.arrays.floating import FloatingDtype
 from pandas.core.arrays.integer import _IntegerDtype
 from pandas.core.construction import extract_array
@@ -52,8 +53,6 @@
 if TYPE_CHECKING:
     import pyarrow
 
-    from pandas.core.arrays.string_arrow import ArrowStringArray
-
 
 @register_extension_dtype
 class StringDtype(ExtensionDtype):
@@ -172,7 +171,7 @@ def __hash__(self) -> int:
     # "ExtensionDtype"
     def construct_array_type(  # type: ignore[override]
         self,
-    ) -> type_t[StringArray | ArrowStringArray]:
+    ) -> type_t[BaseStringArray]:
         """
         Return the array type associated with this dtype.
 
@@ -195,7 +194,7 @@ def __str__(self):
 
     def __from_arrow__(
         self, array: pyarrow.Array | pyarrow.ChunkedArray
-    ) -> StringArray | ArrowStringArray:
+    ) -> BaseStringArray:
         """
         Construct StringArray from pyarrow Array/ChunkedArray.
         """
@@ -225,7 +224,11 @@ def __from_arrow__(
             return StringArray(np.array([], dtype="object"))
 
 
-class StringArray(PandasArray):
+class BaseStringArray(ExtensionArray):
+    pass  # marker base: common parent of StringArray and ArrowStringArray
+
+
+class StringArray(BaseStringArray, PandasArray):
     """
     Extension array for string data.
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 1539c6db6317f..ab8599f0f05ba 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -47,7 +47,10 @@
 from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.arrays.integer import Int64Dtype
 from pandas.core.arrays.numeric import NumericDtype
-from pandas.core.arrays.string_ import StringDtype
+from pandas.core.arrays.string_ import (
+    BaseStringArray,
+    StringDtype,
+)
 from pandas.core.indexers import (
     check_array_indexer,
     validate_indices,
@@ -86,7 +89,7 @@ def _chk_pyarrow_available() -> None:
 # fallback for the ones that pyarrow doesn't yet support
 
 
-class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
+class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 5c7211a5d1852..73463db401ea5 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -420,18 +420,14 @@ def maybe_cast_to_extension_array(
     -------
     ExtensionArray or obj
     """
-    from pandas.core.arrays.string_ import StringArray
-    from pandas.core.arrays.string_arrow import ArrowStringArray
+    from pandas.core.arrays.string_ import BaseStringArray
 
     assert isinstance(cls, type), f"must pass a type: {cls}"
     assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
     assert issubclass(cls, ABCExtensionArray), assertion_msg
 
     # Everything can be converted to StringArrays, but we may not want to convert
-    if (
-        issubclass(cls, (StringArray, ArrowStringArray))
-        and lib.infer_dtype(obj) != "string"
-    ):
+    if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
         return obj
 
     try:
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 7ce4abe904f3b..02bdb7f181583 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -173,8 +173,7 @@ def scalar_rep(x):
 
             return self._str_map(scalar_rep, dtype=str)
         else:
-            from pandas.core.arrays.string_ import StringArray
-            from pandas.core.arrays.string_arrow import ArrowStringArray
+            from pandas.core.arrays.string_ import BaseStringArray
 
             def rep(x, r):
                 if x is libmissing.NA:
@@ -186,7 +185,7 @@ def rep(x, r):
 
             repeats = np.asarray(repeats, dtype=object)
             result = libops.vec_binop(np.asarray(self), repeats, rep)
-            if isinstance(self, (StringArray, ArrowStringArray)):
+            if isinstance(self, BaseStringArray):
                 # Not going through map, so we have to do this here.
                 result = type(self)._from_sequence(result)
             return result

From 51f1b1d7ce878b40826cb96d7e661aae9ab2b726 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 4 Jun 2021 20:40:24 +0100
Subject: [PATCH 45/46] fixup roundtrip tests

---
 pandas/conftest.py                         |  4 ++++
 pandas/tests/arrays/string_/test_string.py | 20 ++++++++++++--------
 pandas/tests/io/test_parquet.py            |  5 +++--
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index f1c0280bc52bb..e106f7f425fa0 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1152,6 +1152,10 @@ def string_storage(request):
     return request.param
 
 
+# Alias so we can test with the cartesian product of two string_storage values
+string_storage2 = string_storage
+
+
 @pytest.fixture(params=tm.BYTES_DTYPES)
 def bytes_dtype(request):
     """
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 92d0d19901b21..5731f02430a9d 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -431,7 +431,7 @@ def test_arrow_array(dtype):
 
 
 @td.skip_if_no("pyarrow")
-def test_arrow_roundtrip(dtype):
+def test_arrow_roundtrip(dtype, string_storage2):
     # roundtrip possible from arrow 1.0.0
     import pyarrow as pa
 
@@ -439,15 +439,17 @@ def test_arrow_roundtrip(dtype):
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
     assert table.field("a").type == "string"
-    result = table.to_pandas()
-    assert isinstance(result["a"].dtype, type(dtype))
-    tm.assert_frame_equal(result, df)
+    with pd.option_context("string_storage", string_storage2):
+        result = table.to_pandas()
+    assert isinstance(result["a"].dtype, pd.StringDtype)
+    expected = df.astype(f"string[{string_storage2}]")
+    tm.assert_frame_equal(result, expected)
     # ensure the missing value is represented by NA and not np.nan or None
     assert result.loc[2, "a"] is pd.NA
 
 
 @td.skip_if_no("pyarrow")
-def test_arrow_load_from_zero_chunks(dtype):
+def test_arrow_load_from_zero_chunks(dtype, string_storage2):
     # GH-41040
     import pyarrow as pa
 
@@ -457,9 +459,11 @@ def test_arrow_load_from_zero_chunks(dtype):
     assert table.field("a").type == "string"
     # Instantiate the same table with no chunks at all
     table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
-    result = table.to_pandas()
-    assert isinstance(result["a"].dtype, type(dtype))
-    tm.assert_frame_equal(result, df)
+    with pd.option_context("string_storage", string_storage2):
+        result = table.to_pandas()
+    assert isinstance(result["a"].dtype, pd.StringDtype)
+    expected = df.astype(f"string[{string_storage2}]")
+    tm.assert_frame_equal(result, expected)
 
 
 def test_value_counts_na(dtype):
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 8535cb0fd8bad..d100c584b698a 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -810,10 +810,11 @@ def test_additional_extension_arrays(self, pa):
         check_round_trip(df, pa)
 
     @td.skip_if_no("pyarrow", min_version="1.0.0")
-    def test_pyarrow_backed_string_array(self, pa):
+    def test_pyarrow_backed_string_array(self, pa, string_storage):
         # test ArrowStringArray supported through the __arrow_array__ protocol
         df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
-        check_round_trip(df, pa, expected=df)
+        with pd.option_context("string_storage", string_storage):
+            check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))
 
     @td.skip_if_no("pyarrow")
     def test_additional_extension_types(self, pa):

From ef02a435ec481eeac62fe00ebc87d501489e13fc Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 7 Jun 2021 11:27:30 +0100
Subject: [PATCH 46/46] remove link

---
 doc/source/whatsnew/v1.3.0.rst | 2 --
 1 file changed, 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 57ca921ff5ba3..c2f25b389c9eb 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -223,8 +223,6 @@ String accessor methods returning integers will return a value with :class:`Int6
 
    s.str.count("a")
 
-See :ref:`text.types` for more.
-
 Centered Datetime-Like Rolling Windows
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^