From 4c2e37a60c4390f02cc45f31626fac9ed2504ca0 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jul 2020 20:19:15 +0200 Subject: [PATCH 01/78] Implement BaseDtypeTests for ArrowStringDtype --- pandas/core/arrays/base.py | 6 +- pandas/core/arrays/string_arrow.py | 254 ++++++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 127 ++++++++++ setup.py | 2 +- 4 files changed, 387 insertions(+), 2 deletions(-) create mode 100644 pandas/core/arrays/string_arrow.py create mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 32a2a30fcfd43..b987a30a0ecd2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -452,9 +452,13 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + # FIXME: Really hard-code here? + if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..f9670d3f0da5f --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,254 @@ +from typing import TYPE_CHECKING, Tuple, Type, Union + +import pyarrow as pa + +from pandas._libs import missing as libmissing + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype + +from pandas.core.arrays.base import ExtensionArray + +if TYPE_CHECKING: + import numpy as np + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.ArrowStringDtype() + ArrowStringDtype + """ + + name = "arrow_string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type[str]: + return str + + @classmethod + def construct_array_type(cls) -> Type["ArrowStringArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ArrowStringArray + + def __hash__(self) -> int: + return hash("ArrowStringDtype") + + def __repr__(self) -> str: + return "ArrowStringDtype" + + def __from_arrow__( + self, array: Union["pa.Array", "pa.ChunkedArray"] + ) -> "ArrowStringArray": + """ + Construct StringArray from pyarrow Array/ChunkedArray. + """ + return ArrowStringArray(array) + + def __eq__(self, other) -> bool: + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, ArrowStringDtype): + return True + elif isinstance(other, str) and other == "arrow_string": + return True + else: + return False + + +class ArrowStringArray(ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. 
warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + def __init__(self, values): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self.data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(pa.array(scalars, type=pa.string())) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return ArrowStringDtype() + + def __array__(self, *args, **kwargs) -> "np.ndarray": + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.data.__array__(*args, **kwargs) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self.data + + @property + def size(self) -> int: + """ + Return the number of elements in this array. + + Returns + ------- + size : int + """ + return len(self.data) + + @property + def shape(self) -> Tuple[int]: + """Return the shape of the data.""" + # This may be patched by pandas to support pseudo-2D operations. + return (len(self.data),) + + @property + def ndim(self) -> int: + """Return the number of dimensions of the underlying data.""" + return 1 + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self.data) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + # def _values_for_factorize(self): + # arr = self._ndarray.copy() + # mask = self.isna() + # arr[mask] = -1 + # return arr, -1 + + def __setitem__(self, key, value): + raise NotImplementedError("__setitem__") + + def fillna(self, value=None, method=None, limit=None): + raise NotImplementedError("fillna") + + # def astype(self, dtype, copy=True): + # dtype = pandas_dtype(dtype) + # if isinstance(dtype, StringDtype): + # if copy: + # return self.copy() + # return self + # elif isinstance(dtype, _IntegerDtype): + # arr = self._ndarray.copy() + # mask = self.isna() + # arr[mask] = 0 + # values = arr.astype(dtype.numpy_dtype) + # return IntegerArray(values, mask, copy=False) + + # return super().astype(dtype, copy) + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + # def value_counts(self, dropna=False): + # from pandas import value_counts + + # return value_counts(self._ndarray, dropna=dropna).astype("Int64") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. 
+ """ + size = 0 + for chunk in self.data.chunks: + for buf in chunk.buffers(): + if buf is not None: + size += buf.size + return size diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py new file mode 100644 index 0000000000000..756d2dd0a739f --- /dev/null +++ b/pandas/tests/extension/test_string_arrow.py @@ -0,0 +1,127 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return ArrowStringDtype() + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return ArrowStringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + # TODO(ARROW-9407): Accept pd.NA in Arrow + return ArrowStringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return ArrowStringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + # TODO(ARROW-9407): Accept pd.NA in Arrow + return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + # TODO(ARROW-9407): Accept pd.NA in Arrow + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + # TODO(ARROW-9407): Accept pd.NA in Arrow + return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +# class TestConstructors(base.BaseConstructorsTests): +# pass + + +# class TestReshaping(base.BaseReshapingTests): +# pass + + +# class TestGetitem(base.BaseGetitemTests): +# pass + + +# class TestSetitem(base.BaseSetitemTests): +# pass + + +# class TestMissing(base.BaseMissingTests): +# pass + + +# class TestNoReduce(base.BaseNoReduceTests): +# @pytest.mark.parametrize("skipna", [True, False]) +# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): +# op_name = all_numeric_reductions +# +# if op_name in ["min", "max"]: +# return None +# +# s = pd.Series(data) +# with pytest.raises(TypeError): +# getattr(s, op_name)(skipna=skipna) + + +# class TestMethods(base.BaseMethodsTests): +# @pytest.mark.skip(reason="returns nullable") +# def test_value_counts(self, all_data, dropna): +# return super().test_value_counts(all_data, dropna) + + +# class TestCasting(base.BaseCastingTests): +# pass + + +# class TestComparisonOps(base.BaseComparisonOpsTests): +# def _compare_other(self, s, data, op_name, other): +# result = getattr(s, op_name)(other) +# expected = getattr(s.astype(object), op_name)(other).astype("boolean") +# self.assert_series_equal(result, expected) + +# def test_compare_scalar(self, data, all_compare_operators): +# op_name = all_compare_operators +# s = pd.Series(data) +# self._compare_other(s, data, op_name, "abc") + + +# class TestParsing(base.BaseParsingTests): +# pass + + +# class TestPrinting(base.BasePrintingTests): +# pass + + +# class TestGroupBy(base.BaseGroupbyTests): +# pass diff --git a/setup.py b/setup.py index 1885546e001fe..d83092514aca8 100755 --- a/setup.py +++ b/setup.py @@ -434,7 +434,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = ["-Werror"] + extra_compile_args = [] extra_link_args = [] if 
debugging_symbols_requested: extra_compile_args.append("-g") From d477ee7520afb5f5606967ec0caaa5cf2a6e1730 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 12:07:06 +0200 Subject: [PATCH 02/78] Implement getitem --- pandas/core/arrays/string_arrow.py | 76 +++++++++++++++++++-- pandas/tests/extension/test_string_arrow.py | 4 -- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f9670d3f0da5f..28b6231fdb516 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,7 @@ -from typing import TYPE_CHECKING, Tuple, Type, Union +from collections.abc import Iterable +from typing import Tuple, Type, Union +import numpy as np import pyarrow as pa from pandas._libs import missing as libmissing @@ -7,10 +9,10 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype +import pandas as pd +from pandas.api.types import is_integer from pandas.core.arrays.base import ExtensionArray - -if TYPE_CHECKING: - import numpy as np +from pandas.core.indexers import check_array_indexer @register_extension_dtype @@ -150,7 +152,9 @@ def __init__(self, values): @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): - return cls(pa.array(scalars, type=pa.string())) + # TODO(ARROW-9407): Accept pd.NA in Arrow + scalars_corrected = [None if pd.isna(x) else x for x in scalars] + return cls(pa.array(scalars_corrected, type=pa.string())) @property def dtype(self) -> ArrowStringDtype: @@ -209,6 +213,60 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): # arr[mask] = -1 # return arr, -1 + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, Iterable): + raise NotImplementedError("Iterable") + # if not is_array_like(item): + # item = np.array(item) + # if is_integer_dtype(item) or (len(item) == 0): + # return self.take(item) + # elif is_bool_dtype(item): + # indices = np.array(item) + # indices = np.argwhere(indices).flatten() + # return self.take(indices) + # else: + # raise IndexError( + # """Only integers, slices and integer or + # boolean arrays are valid indices.""" + # ) + elif is_integer(item): + if item < 0: + item += len(self) + if item >= len(self): + return None + + value = self.data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return value.as_py() + def __setitem__(self, key, value): raise NotImplementedError("__setitem__") @@ -252,3 +310,11 @@ def nbytes(self) -> int: if buf is not None: size += buf.size return size + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. 
+ """ + return self.data.is_null() diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 756d2dd0a739f..208a79e7be460 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -25,7 +25,6 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - # TODO(ARROW-9407): Accept pd.NA in Arrow return ArrowStringArray._from_sequence([pd.NA, "A"]) @@ -36,19 +35,16 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - # TODO(ARROW-9407): Accept pd.NA in Arrow return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) @pytest.fixture def na_value(): - # TODO(ARROW-9407): Accept pd.NA in Arrow return pd.NA @pytest.fixture def data_for_grouping(): - # TODO(ARROW-9407): Accept pd.NA in Arrow return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) From 206f4930afbaa010436583a935b4b05205953fcc Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 12:10:26 +0200 Subject: [PATCH 03/78] Add basic copy implementation --- pandas/core/arrays/string_arrow.py | 16 ++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 28b6231fdb516..65e79da13bc99 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -318,3 +318,19 @@ def isna(self) -> np.ndarray: This should return a 1-D array the same length as 'self'. """ return self.data.is_null() + + def copy(self): + # type: () -> ExtensionArray + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + return type(self)(self.data) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 208a79e7be460..e94ffcd44e3e9 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -56,8 +56,8 @@ class TestInterface(base.BaseInterfaceTests): pass -# class TestConstructors(base.BaseConstructorsTests): -# pass +class TestConstructors(base.BaseConstructorsTests): + pass # class TestReshaping(base.BaseReshapingTests): From d58dba6cf40caed93d43460d050117e3ec766989 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Mon, 13 Jul 2020 12:17:38 +0200 Subject: [PATCH 04/78] Implement getitem for iterables --- pandas/core/arrays/string_arrow.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 65e79da13bc99..1c553c1778b15 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.dtypes import register_extension_dtype import pandas as pd -from pandas.api.types import is_integer +from pandas.api.types import is_array_like, is_bool_dtype, is_integer, is_integer_dtype from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer @@ -241,20 +241,19 @@ def __getitem__(self, item): item = check_array_indexer(self, item) if isinstance(item, Iterable): - raise NotImplementedError("Iterable") - # if not is_array_like(item): - # item = np.array(item) - # if is_integer_dtype(item) or (len(item) == 0): - # return self.take(item) - # elif is_bool_dtype(item): - # indices = np.array(item) - # indices = np.argwhere(indices).flatten() - # return self.take(indices) - # else: - # raise IndexError( - # """Only integers, slices and integer or - # boolean arrays are valid indices.""" - # ) + if not is_array_like(item): + item = np.array(item) + if len(item) == 0: + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item): + return self.take(item) + elif is_bool_dtype(item): + return type(self)(self.data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) elif is_integer(item): if item < 0: item += len(self) From 7a9e2c3a40e4103ae2353a2d7af717a41eeb1ff6 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 12:19:37 +0200 Subject: [PATCH 05/78] Remove commented code --- pandas/core/arrays/string_arrow.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1c553c1778b15..b9d60b5034ae3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -207,12 +207,6 @@ def __len__(self) -> int: def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) - # def _values_for_factorize(self): - # arr = self._ndarray.copy() - # mask = self.isna() - # arr[mask] = -1 - # return arr, -1 - def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. 
@@ -272,32 +266,12 @@ def __setitem__(self, key, value): def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") - # def astype(self, dtype, copy=True): - # dtype = pandas_dtype(dtype) - # if isinstance(dtype, StringDtype): - # if copy: - # return self.copy() - # return self - # elif isinstance(dtype, _IntegerDtype): - # arr = self._ndarray.copy() - # mask = self.isna() - # arr[mask] = 0 - # values = arr.astype(dtype.numpy_dtype) - # return IntegerArray(values, mask, copy=False) - - # return super().astype(dtype, copy) - def _reduce(self, name, skipna=True, **kwargs): if name in ["min", "max"]: return getattr(self, name)(skipna=skipna) raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - # def value_counts(self, dropna=False): - # from pandas import value_counts - - # return value_counts(self._ndarray, dropna=dropna).astype("Int64") - @property def nbytes(self) -> int: """ From ffc4c0f70c0aaa520b3ca81fb7da938c6172ac92 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 13:44:24 +0200 Subject: [PATCH 06/78] Implement more Setitem/Getitem variants --- pandas/core/arrays/string_arrow.py | 169 +++++++++++++++++++- pandas/tests/extension/test_string_arrow.py | 12 +- 2 files changed, 169 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b9d60b5034ae3..29f369b117948 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,16 +1,23 @@ from collections.abc import Iterable -from typing import Tuple, Type, Union +from typing import Any, Sequence, Tuple, Type, Union import numpy as np import pyarrow as pa from pandas._libs import missing as libmissing +from pandas._typing import ArrayLike from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype import pandas as pd -from pandas.api.types import is_array_like, is_bool_dtype, is_integer, is_integer_dtype +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer @@ -260,9 +267,6 @@ def __getitem__(self, item): else: return value.as_py() - def __setitem__(self, key, value): - raise NotImplementedError("__setitem__") - def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") @@ -292,8 +296,7 @@ def isna(self) -> np.ndarray: """ return self.data.is_null() - def copy(self): - # type: () -> ExtensionArray + def copy(self) -> ExtensionArray: """ Return a copy of the array. @@ -307,3 +310,155 @@ def copy(self): ExtensionArray """ return type(self)(self.data) + + def __eq__(self, other: Any) -> ArrayLike: + """ + Return for `self == other` (element-wise equality). + """ + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + return NotImplemented + if isinstance(other, ArrowStringArray): + result = self.data == other.data + elif is_scalar(other): + result = self.data == pa.scalar(other) + else: + raise NotImplementedError("Neither scalar nor ArrowStringArray") + + # TODO: Add a .to_numpy() to ChunkedArray + return pd.array(result.to_pandas().values) + + def __setitem__(self, key, value): + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. 
``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not pd.api.types.is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self.data[0:key].chunks, + pa.array([value], type=pa.string()), + *self.data[(key + 1) :].chunks, + ] + self.data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. + # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + key_array = np.asanyarray(key) + + if pd.api.types.is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. 
+ """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if allow_fill: + if (indices_array < 0).any(): + raise NotImplementedError("allow_fill=True") + else: + # Nothing to fill + return type(self)(self.data.take(indices)) + else: # allow_fill=False + if (indices_array < 0).any(): + raise NotImplementedError("negative indices") + return type(self)(self.data.take(indices)) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index e94ffcd44e3e9..437d51060fb7f 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -53,7 +53,9 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - pass + @pytest.mark.xfail(reason="Fails until implement, remove before merge") + def test_view(self, data): + base.BaseInterfaceTests.test_view(self, data) class TestConstructors(base.BaseConstructorsTests): @@ -64,12 +66,12 @@ class TestConstructors(base.BaseConstructorsTests): # pass -# class TestGetitem(base.BaseGetitemTests): -# pass +class TestGetitem(base.BaseGetitemTests): + pass -# class TestSetitem(base.BaseSetitemTests): -# pass +class TestSetitem(base.BaseSetitemTests): + pass # class TestMissing(base.BaseMissingTests): From c1305ab833db1bed89929b6f040edbf82f63c54c Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 13:54:41 +0200 Subject: [PATCH 07/78] Review comments by @jorisvandenbossche --- pandas/core/arrays/string_arrow.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 29f369b117948..29f39084fab3e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,5 @@ from collections.abc import Iterable -from typing import Any, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np import pyarrow as pa @@ -22,6 +22,14 @@ from pandas.core.indexers import check_array_indexer +def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: + scalar = arrow_scalar.as_py() + if scalar is None: + return libmissing.NA + else: + return scalar + + @register_extension_dtype class ArrowStringDtype(ExtensionDtype): """ @@ -259,13 +267,13 @@ def __getitem__(self, item): if item < 0: item += len(self) if item >= len(self): - return None + raise IndexError("index out of bounds") value = self.data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: - return value.as_py() + return _as_pandas_scalar(value) def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") @@ -281,12 +289,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. """ - size = 0 - for chunk in self.data.chunks: - for buf in chunk.buffers(): - if buf is not None: - size += buf.size - return size + return self.data.nbytes def isna(self) -> np.ndarray: """ @@ -294,7 +297,8 @@ def isna(self) -> np.ndarray: This should return a 1-D array the same length as 'self'. """ - return self.data.is_null() + # TODO: Implement .to_numpy for ChunkedArray + return self.data.is_null().to_pandas().values def copy(self) -> ExtensionArray: """ From 13a42f74a62b0ab37b35c1346306b0f45a4479eb Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Mon, 13 Jul 2020 14:20:06 +0200 Subject: [PATCH 08/78] Add Arrow issue numbers --- pandas/core/arrays/string_arrow.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 29f39084fab3e..bf2f07ed637fd 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -328,7 +328,7 @@ def __eq__(self, other: Any) -> ArrayLike: else: raise NotImplementedError("Neither scalar nor ArrowStringArray") - # TODO: Add a .to_numpy() to ChunkedArray + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return pd.array(result.to_pandas().values) def __setitem__(self, key, value): @@ -377,10 +377,12 @@ def __setitem__(self, key, value): # Convert all possible input key types to an array of integers if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) key_array = np.argwhere(key).flatten() elif isinstance(key, slice): key_array = np.array(range(len(self))[key]) else: + # TODO(ARROW-9431): Directly support setitem(integers) key_array = np.asanyarray(key) if pd.api.types.is_scalar(value): @@ -458,11 +460,13 @@ def take( if allow_fill: if (indices_array < 0).any(): + # TODO(ARROW-9433): Treat negative indices as NULL raise NotImplementedError("allow_fill=True") else: # Nothing to fill return type(self)(self.data.take(indices)) else: # allow_fill=False if (indices_array < 0).any(): + # TODO(ARROW-9432): Treat negative indices as indices from the right. raise NotImplementedError("negative indices") return type(self)(self.data.take(indices)) From decd0220a10f3735c0106efc870276f7af0c2c96 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 15 Jul 2020 16:32:47 +0200 Subject: [PATCH 09/78] Adopt to kernel renamings --- pandas/core/arrays/string_arrow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bf2f07ed637fd..9311e07226366 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,6 +3,7 @@ import numpy as np import pyarrow as pa +import pyarrow.compute as pc from pandas._libs import missing as libmissing from pandas._typing import ArrayLike @@ -322,9 +323,9 @@ def __eq__(self, other: Any) -> ArrayLike: if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = self.data == other.data + result = pc.equal(self.data, other.data) elif is_scalar(other): - result = self.data == pa.scalar(other) + result = pc.equal(self.data, pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") From 3145e44d6e8c3a3b4f684cbff332689e5d056c69 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 15 Jul 2020 16:58:49 +0200 Subject: [PATCH 10/78] Handle take(indices<0, allow_fill=False) --- pandas/core/arrays/string_arrow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9311e07226366..402f17772a572 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -467,7 +467,9 @@ def take( # Nothing to fill return type(self)(self.data.take(indices)) else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. if (indices_array < 0).any(): - # TODO(ARROW-9432): Treat negative indices as indices from the right. 
-                raise NotImplementedError("negative indices")
-            return type(self)(self.data.take(indices))
+                # Don't modify in-place
+                indices_array = np.copy(indices_array)
+                indices_array[indices_array < 0] += len(self.data)
+            return type(self)(self.data.take(indices_array))

From e22b3481657cc7b5ba1dcc075c8e4d2effa7f0b6 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn"
Date: Wed, 15 Jul 2020 17:17:52 +0200
Subject: [PATCH 11/78] Handle fill_value better

---
 pandas/core/arrays/string_arrow.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 402f17772a572..8248a3e91c0fe 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -459,10 +459,19 @@ def take(
         else:
             indices_array = indices
 
+        if len(self.data) == 0 and (indices_array >= 0).any():
+            raise IndexError("cannot do a non-empty take")
+        if indices_array.max() >= len(self.data):
+            raise IndexError("out of bounds value in 'indices'.")
+
         if allow_fill:
             if (indices_array < 0).any():
                 # TODO(ARROW-9433): Treat negative indices as NULL
-                raise NotImplementedError("allow_fill=True")
+                indices_array = pa.array(indices_array, mask=indices_array < 0)
+                result = self.data.take(indices_array)
+                if pd.isna(fill_value):
+                    return type(self)(result)
+                return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
             else:
                 # Nothing to fill
                 return type(self)(self.data.take(indices))

From 2446562047018793f7d0c445c904c3abcd06be18 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Mon, 19 Oct 2020 16:33:37 +0100
Subject: [PATCH 12/78] fix doctest

---
 pandas/__init__.py                 |  1 +
 pandas/core/api.py                 |  2 ++
 pandas/core/arrays/string_arrow.py | 43 ++++++++++++++++++------------
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/pandas/__init__.py b/pandas/__init__.py
index cf7ae2505b72d..a4e9e04560241 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -65,6 +65,7 @@
     IntervalDtype,
     DatetimeTZDtype,
     StringDtype,
+    ArrowStringDtype,
     BooleanDtype,
     # missing
     NA,
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 67e86c2076329..d8210d114e213 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -57,3 +57,5 @@
 
 # DataFrame needs to be imported after NamedAgg to avoid a circular import
 from pandas.core.frame import DataFrame  # isort:skip
+
+from pandas.core.arrays.string_arrow import ArrowStringDtype  # isort:skip
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 8248a3e91c0fe..7b8275c1d0f9a 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,9 +1,9 @@
+from __future__ import annotations
+
 from collections.abc import Iterable
-from typing import Any, Optional, Sequence, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union
 
 import numpy as np
-import pyarrow as pa
-import pyarrow.compute as pc
 
 from pandas._libs import missing as libmissing
 from pandas._typing import ArrayLike
@@ -22,6 +22,9 @@
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import check_array_indexer
 
+if TYPE_CHECKING:
+    import pyarrow as pa
+
 
 def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]:
     scalar = arrow_scalar.as_py()
@@ -36,7 +39,7 @@ class ArrowStringDtype(ExtensionDtype):
     """
     Extension dtype for string data in a ``pyarrow.ChunkedArray``.
 
-    .. versionadded:: 1.1.0
+    .. versionadded:: 1.2.0
 
     .. 
warning:: @@ -57,6 +60,9 @@ class ArrowStringDtype(ExtensionDtype): ArrowStringDtype """ + import pyarrow as pa + import pyarrow.compute as pc + name = "arrow_string" #: StringDtype.na_value uses pandas.NA @@ -118,7 +124,7 @@ class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. warning:: @@ -158,10 +164,13 @@ class ArrowStringArray(ExtensionArray): Length: 4, dtype: arrow_string """ + import pyarrow as pa + import pyarrow.compute as pc + def __init__(self, values): - if isinstance(values, pa.Array): - self.data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): + if isinstance(values, self.pa.Array): + self.data = self.pa.chunked_array([values]) + elif isinstance(values, self.pa.ChunkedArray): self.data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") @@ -170,7 +179,7 @@ def __init__(self, values): def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO(ARROW-9407): Accept pd.NA in Arrow scalars_corrected = [None if pd.isna(x) else x for x in scalars] - return cls(pa.array(scalars_corrected, type=pa.string())) + return cls(cls.pa.array(scalars_corrected, type=cls.pa.string())) @property def dtype(self) -> ArrowStringDtype: @@ -254,7 +263,7 @@ def __getitem__(self, item): if not is_array_like(item): item = np.array(item) if len(item) == 0: - return type(self)(pa.chunked_array([], type=pa.string())) + return type(self)(self.pa.chunked_array([], type=self.pa.string())) elif is_integer_dtype(item): return self.take(item) elif is_bool_dtype(item): @@ -271,7 +280,7 @@ def __getitem__(self, item): raise IndexError("index out of bounds") value = self.data[item] - if isinstance(value, pa.ChunkedArray): + if isinstance(value, self.pa.ChunkedArray): return type(self)(value) else: return _as_pandas_scalar(value) @@ -323,9 +332,9 @@ def __eq__(self, other: Any) -> ArrayLike: if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = pc.equal(self.data, other.data) + result = self.pc.equal(self.data, other.data) elif is_scalar(other): - result = pc.equal(self.data, pa.scalar(other)) + result = self.pc.equal(self.data, self.pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") @@ -367,10 +376,10 @@ def __setitem__(self, key, value): # Slice data and insert inbetween new_data = [ *self.data[0:key].chunks, - pa.array([value], type=pa.string()), + self.pa.array([value], type=self.pa.string()), *self.data[(key + 1) :].chunks, ] - self.data = pa.chunked_array(new_data) + self.data = self.pa.chunked_array(new_data) else: # Convert to integer indices and iteratively assign. # TODO: Make a faster variant of this in Arrow upstream. 
@@ -467,11 +476,11 @@ def take( if allow_fill: if (indices_array < 0).any(): # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=indices_array < 0) + indices_array = self.pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) if pd.isna(fill_value): return type(self)(result) - return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + return type(self)(self.pc.fill_null(result, self.pa.scalar(fill_value))) else: # Nothing to fill return type(self)(self.data.take(indices)) From a0dcc85b0f447e482c54efc21cc82395dabdb677 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 19 Oct 2020 19:45:14 +0100 Subject: [PATCH 13/78] Revert "fix doctest" This reverts commit 2446562047018793f7d0c445c904c3abcd06be18. --- pandas/__init__.py | 1 - pandas/core/api.py | 2 -- pandas/core/arrays/string_arrow.py | 43 ++++++++++++------------------ 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index a4e9e04560241..cf7ae2505b72d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,6 @@ IntervalDtype, DatetimeTZDtype, StringDtype, - ArrowStringDtype, BooleanDtype, # missing NA, diff --git a/pandas/core/api.py b/pandas/core/api.py index d8210d114e213..67e86c2076329 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -57,5 +57,3 @@ # DataFrame needs to be imported after NamedAgg to avoid a circular import from pandas.core.frame import DataFrame # isort:skip - -from pandas.core.arrays.string_arrow import ArrowStringDtype # isort:skip diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7b8275c1d0f9a..8248a3e91c0fe 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,9 +1,9 @@ -from __future__ import annotations - from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np +import pyarrow as pa +import pyarrow.compute as pc from pandas._libs import missing as libmissing from pandas._typing import ArrayLike @@ -22,9 +22,6 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer -if TYPE_CHECKING: - import pyarrow as pa - def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: scalar = arrow_scalar.as_py() @@ -39,7 +36,7 @@ class ArrowStringDtype(ExtensionDtype): """ Extension dtype for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.2.0 + .. versionadded:: 1.1.0 .. warning:: @@ -60,9 +57,6 @@ class ArrowStringDtype(ExtensionDtype): ArrowStringDtype """ - import pyarrow as pa - import pyarrow.compute as pc - name = "arrow_string" #: StringDtype.na_value uses pandas.NA @@ -124,7 +118,7 @@ class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.2.0 + .. versionadded:: 1.1.0 .. 
warning:: @@ -164,13 +158,10 @@ class ArrowStringArray(ExtensionArray): Length: 4, dtype: arrow_string """ - import pyarrow as pa - import pyarrow.compute as pc - def __init__(self, values): - if isinstance(values, self.pa.Array): - self.data = self.pa.chunked_array([values]) - elif isinstance(values, self.pa.ChunkedArray): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): self.data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") @@ -179,7 +170,7 @@ def __init__(self, values): def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO(ARROW-9407): Accept pd.NA in Arrow scalars_corrected = [None if pd.isna(x) else x for x in scalars] - return cls(cls.pa.array(scalars_corrected, type=cls.pa.string())) + return cls(pa.array(scalars_corrected, type=pa.string())) @property def dtype(self) -> ArrowStringDtype: @@ -263,7 +254,7 @@ def __getitem__(self, item): if not is_array_like(item): item = np.array(item) if len(item) == 0: - return type(self)(self.pa.chunked_array([], type=self.pa.string())) + return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): return self.take(item) elif is_bool_dtype(item): @@ -280,7 +271,7 @@ def __getitem__(self, item): raise IndexError("index out of bounds") value = self.data[item] - if isinstance(value, self.pa.ChunkedArray): + if isinstance(value, pa.ChunkedArray): return type(self)(value) else: return _as_pandas_scalar(value) @@ -332,9 +323,9 @@ def __eq__(self, other: Any) -> ArrayLike: if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = self.pc.equal(self.data, other.data) + result = pc.equal(self.data, other.data) elif is_scalar(other): - result = self.pc.equal(self.data, self.pa.scalar(other)) + result = pc.equal(self.data, pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") @@ -376,10 +367,10 @@ def __setitem__(self, key, value): # Slice data and insert inbetween new_data = [ *self.data[0:key].chunks, - self.pa.array([value], type=self.pa.string()), + pa.array([value], type=pa.string()), *self.data[(key + 1) :].chunks, ] - self.data = self.pa.chunked_array(new_data) + self.data = pa.chunked_array(new_data) else: # Convert to integer indices and iteratively assign. # TODO: Make a faster variant of this in Arrow upstream. 
@@ -476,11 +467,11 @@ def take( if allow_fill: if (indices_array < 0).any(): # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = self.pa.array(indices_array, mask=indices_array < 0) + indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) if pd.isna(fill_value): return type(self)(result) - return type(self)(self.pc.fill_null(result, self.pa.scalar(fill_value))) + return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill return type(self)(self.data.take(indices)) From 5c4217345246347c655654f8d395e5fe02f35efb Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 19 Oct 2020 19:46:28 +0100 Subject: [PATCH 14/78] change version for versionadded --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8248a3e91c0fe..1a183b5535cbf 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -36,7 +36,7 @@ class ArrowStringDtype(ExtensionDtype): """ Extension dtype for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. warning:: @@ -118,7 +118,7 @@ class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. warning:: From 28c3ef275728cd5071f9ded151d61cc919a521e7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 19 Oct 2020 20:35:46 +0100 Subject: [PATCH 15/78] code checks --- pandas/core/arrays/string_arrow.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1a183b5535cbf..98df636f8435a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,4 +1,4 @@ -from collections.abc import Iterable +from collections import abc from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np @@ -53,7 +53,8 @@ class ArrowStringDtype(ExtensionDtype): Examples -------- - >>> pd.ArrowStringDtype() + >>> from pandas.core.arrays.string_arrow import ArrowStringDtype + >>> ArrowStringDtype() ArrowStringDtype """ @@ -223,8 +224,7 @@ def __len__(self) -> int: def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) - def __getitem__(self, item): - # type (Any) -> Any + def __getitem__(self, item: Any) -> Any: """Select a subset of self. Parameters @@ -250,7 +250,7 @@ def __getitem__(self, item): """ item = check_array_indexer(self, item) - if isinstance(item, Iterable): + if isinstance(item, abc.Iterable): if not is_array_like(item): item = np.array(item) if len(item) == 0: @@ -332,8 +332,7 @@ def __eq__(self, other: Any) -> ArrayLike: # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return pd.array(result.to_pandas().values) - def __setitem__(self, key, value): - # type: (Union[int, np.ndarray], Any) -> None + def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: """Set one or more values inplace. 
Parameters From 1740524f6d374d7c040be3c916eb71ba6a0e42ce Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 21 Oct 2020 20:46:11 +0100 Subject: [PATCH 16/78] skip tests for pyarrow<1.0 --- pandas/tests/extension/test_string_arrow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 437d51060fb7f..7b3f585ce2fb5 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -4,9 +4,12 @@ import pytest import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype from pandas.tests.extension import base +pytest.importorskip("pyarrow", minversion="1.0") + +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype + @pytest.fixture def dtype(): From 34bf57d259f4943db893b3ac96732a74e86a1b2c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 2 Nov 2020 13:45:03 +0000 Subject: [PATCH 17/78] raise ImportError in constructors on pyarrow < 1.0.0. or not installed --- pandas/core/arrays/string_arrow.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 98df636f8435a..bb3432743663d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,9 +1,10 @@ +from __future__ import annotations + from collections import abc +from distutils.version import LooseVersion from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np -import pyarrow as pa -import pyarrow.compute as pc from pandas._libs import missing as libmissing from pandas._typing import ArrayLike @@ -22,6 +23,16 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer +try: + import pyarrow as pa +except ImportError: + pa = None +else: + try: + import pyarrow.compute as pc + except ImportError: + pass + def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: scalar = arrow_scalar.as_py() @@ -160,6 +171,7 @@ class ArrowStringArray(ExtensionArray): """ def __init__(self, values): + self._chk_pyarrow_available() if isinstance(values, pa.Array): self.data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): @@ -167,8 +179,18 @@ def __init__(self, values): else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + @classmethod + def _chk_pyarrow_available(cls) -> None: + # TODO: maybe update import_optional_dependency to allow a minimum + # version to be specified rather than use the global minimum + if pa is None or LooseVersion(pa.__version__) < "1.0.0": + msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." 
+ raise ImportError(msg) + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): + cls._chk_pyarrow_available() + # TODO(ARROW-9407): Accept pd.NA in Arrow scalars_corrected = [None if pd.isna(x) else x for x in scalars] return cls(pa.array(scalars_corrected, type=pa.string())) From f92241e3341872cc6b9849d46c3aa6033267a27b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 2 Nov 2020 16:43:06 +0000 Subject: [PATCH 18/78] remove size, shape and ndim --- pandas/core/arrays/string_arrow.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bb3432743663d..a0b719e5116e7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,7 +2,7 @@ from collections import abc from distutils.version import LooseVersion -from typing import Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Type, Union import numpy as np @@ -210,28 +210,6 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" return self.data - @property - def size(self) -> int: - """ - Return the number of elements in this array. - - Returns - ------- - size : int - """ - return len(self.data) - - @property - def shape(self) -> Tuple[int]: - """Return the shape of the data.""" - # This may be patched by pandas to support pseudo-2D operations. - return (len(self.data),) - - @property - def ndim(self) -> int: - """Return the number of dimensions of the underlying data.""" - return 1 - def __len__(self) -> int: """ Length of this array. From c09382d303e3097d324dfb43b824f615412750fa Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 2 Nov 2020 18:03:16 +0000 Subject: [PATCH 19/78] activate all extension array tests --- pandas/tests/extension/test_string_arrow.py | 72 ++++++++++----------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 7b3f585ce2fb5..d6c8838a55523 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -65,8 +65,8 @@ class TestConstructors(base.BaseConstructorsTests): pass -# class TestReshaping(base.BaseReshapingTests): -# pass +class TestReshaping(base.BaseReshapingTests): + pass class TestGetitem(base.BaseGetitemTests): @@ -77,52 +77,52 @@ class TestSetitem(base.BaseSetitemTests): pass -# class TestMissing(base.BaseMissingTests): -# pass +class TestMissing(base.BaseMissingTests): + pass + + +class TestNoReduce(base.BaseNoReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + if op_name in ["min", "max"]: + return None -# class TestNoReduce(base.BaseNoReduceTests): -# @pytest.mark.parametrize("skipna", [True, False]) -# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): -# op_name = all_numeric_reductions -# -# if op_name in ["min", "max"]: -# return None -# -# s = pd.Series(data) -# with pytest.raises(TypeError): -# getattr(s, op_name)(skipna=skipna) + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) -# class TestMethods(base.BaseMethodsTests): -# @pytest.mark.skip(reason="returns nullable") -# def test_value_counts(self, all_data, dropna): -# return super().test_value_counts(all_data, dropna) +class 
TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) -# class TestCasting(base.BaseCastingTests): -# pass +class TestCasting(base.BaseCastingTests): + pass -# class TestComparisonOps(base.BaseComparisonOpsTests): -# def _compare_other(self, s, data, op_name, other): -# result = getattr(s, op_name)(other) -# expected = getattr(s.astype(object), op_name)(other).astype("boolean") -# self.assert_series_equal(result, expected) +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") + self.assert_series_equal(result, expected) -# def test_compare_scalar(self, data, all_compare_operators): -# op_name = all_compare_operators -# s = pd.Series(data) -# self._compare_other(s, data, op_name, "abc") + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") -# class TestParsing(base.BaseParsingTests): -# pass +class TestParsing(base.BaseParsingTests): + pass -# class TestPrinting(base.BasePrintingTests): -# pass +class TestPrinting(base.BasePrintingTests): + pass -# class TestGroupBy(base.BaseGroupbyTests): -# pass +class TestGroupBy(base.BaseGroupbyTests): + pass From bac64c10c322af2e8304b69cdc28fe091fda3300 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 13:43:02 +0000 Subject: [PATCH 20/78] string array tests --- pandas/tests/arrays/string_/test_string.py | 340 +++++++++++++++------ 1 file changed, 248 insertions(+), 92 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 089bbcf4e0e3f..30fe82758313e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -7,10 +7,46 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") -def test_repr(): - df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) + +@pytest.fixture( + params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] +) +def dtype(request): + return request.param + + +@pytest.fixture +def dtype_object(dtype): + if dtype == "string": + return pd.StringDtype + else: + return ArrowStringDtype + + +@pytest.fixture( + params=[ + pd.arrays.StringArray, + pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), + ] +) +def cls(request): + return request.param + + +def test_repr(dtype, request): + if dtype == "arrow_string": + reason = ( + "AssertionError: assert ' A\n0 a\n1 None\n2 b' " + "== ' A\n0 a\n1 \n2 b'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected @@ -21,27 +57,36 @@ def test_repr(): assert repr(df.A.array) == expected -def test_none_to_nan(): - a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) +def test_none_to_nan(cls): + a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None assert a[1] is pd.NA -def test_setitem_validates(): - a = pd.arrays.StringArray._from_sequence(["a", "b"]) - with pytest.raises(ValueError, match="10"): - a[0] = 10 +def 
test_setitem_validates(cls): + arr = cls._from_sequence(["a", "b"]) - with pytest.raises(ValueError, match="strings"): - a[:] = np.array([1, 2]) + if cls is pd.arrays.StringArray: + msg = "Cannot set non-string value '10' into a StringArray." + else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + arr[0] = 10 + + if cls is pd.arrays.StringArray: + msg = "Must provide strings." + else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + arr[:] = np.array([1, 2]) -def test_setitem_with_scalar_string(): +def test_setitem_with_scalar_string(dtype): # is_float_dtype considers some strings, like 'd', to be floats # which can cause issues. - arr = pd.array(["a", "c"], dtype="string") + arr = pd.array(["a", "c"], dtype=dtype) arr[0] = "d" - expected = pd.array(["d", "c"], dtype="string") + expected = pd.array(["d", "c"], dtype=dtype) tm.assert_extension_array_equal(arr, expected) @@ -53,46 +98,64 @@ def test_setitem_with_scalar_string(): (["a b", "a bc. de"], operator.methodcaller("capitalize")), ], ) -def test_string_methods(input, method): - a = pd.Series(input, dtype="string") +def test_string_methods(input, method, dtype, request): + if dtype == "arrow_string": + reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.Series(input, dtype=dtype) b = pd.Series(input, dtype="object") result = method(a.str) expected = method(b.str) - assert result.dtype.name == "string" + assert result.dtype.name == dtype tm.assert_series_equal(result.astype(object), expected) -def test_astype_roundtrip(): +def test_astype_roundtrip(dtype): s = pd.Series(pd.date_range("2000", periods=12)) s[0] = None - result = s.astype("string").astype("datetime64[ns]") + result = s.astype(dtype).astype("datetime64[ns]") tm.assert_series_equal(result, s) -def test_add(): - a = pd.Series(["a", "b", "c", None, None], dtype="string") - b = pd.Series(["x", "y", None, "z", None], dtype="string") +def test_add(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and " + "'ArrowStringArray'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.Series(["a", "b", "c", None, None], dtype=dtype) + b = pd.Series(["x", "y", None, "z", None], dtype=dtype) result = a + b - expected = pd.Series(["ax", "by", None, None, None], dtype="string") + expected = pd.Series(["ax", "by", None, None, None], dtype=dtype) tm.assert_series_equal(result, expected) result = a.add(b) tm.assert_series_equal(result, expected) result = a.radd(b) - expected = pd.Series(["xa", "yb", None, None, None], dtype="string") + expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype) tm.assert_series_equal(result, expected) result = a.add(b, fill_value="-") - expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype) tm.assert_series_equal(result, expected) -def test_add_2d(): - a = pd.array(["a", "b", "c"], dtype="string") +def test_add_2d(dtype, request): + if dtype == "arrow_string": + reason = "Failed: DID NOT RAISE " + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", "c"], dtype=dtype) b = np.array([["a", "b", "c"]], dtype=object) with pytest.raises(ValueError, match="3 != 1"): a + b @@ -102,23 +165,38 @@ def test_add_2d(): s + b -def test_add_sequence(): - a = 
pd.array(["a", "b", None, None], dtype="string") +def test_add_sequence(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' " + "and 'list'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", None, None], dtype=dtype) other = ["x", None, "y", None] result = a + other - expected = pd.array(["ax", None, None, None], dtype="string") + expected = pd.array(["ax", None, None, None], dtype=dtype) tm.assert_extension_array_equal(result, expected) result = other + a - expected = pd.array(["xa", None, None, None], dtype="string") + expected = pd.array(["xa", None, None, None], dtype=dtype) tm.assert_extension_array_equal(result, expected) -def test_mul(): - a = pd.array(["a", "b", None], dtype="string") +def test_mul(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", None], dtype=dtype) result = a * 2 - expected = pd.array(["aa", "bb", None], dtype="string") + expected = pd.array(["aa", "bb", None], dtype=dtype) tm.assert_extension_array_equal(result, expected) result = 2 * a @@ -126,40 +204,51 @@ def test_mul(): @pytest.mark.xfail(reason="GH-28527") -def test_add_strings(): - array = pd.array(["a", "b", "c", "d"], dtype="string") +def test_add_strings(dtype): + array = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "u", "v", "w"]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string") + expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype(dtype) tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string") + expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype(dtype) tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-28527") -def test_add_frame(): - array = pd.array(["a", "b", np.nan, np.nan], dtype="string") +def test_add_frame(dtype): + array = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string") + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string") + expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) tm.assert_frame_equal(result, expected) -def test_comparison_methods_scalar(all_compare_operators): +def test_comparison_methods_scalar(all_compare_operators, dtype, request): + if dtype == "arrow_string": + if all_compare_operators in ["__eq__", "__ne__"]: + reason = ( + "pyarrow.lib.ArrowInvalid: Could not convert with type NAType: " + "did not recognize Python value type when inferring an Arrow data type" + ) + else: + reason = "AssertionError: left is not an ExtensionArray" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype="string") + a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) @@ -171,10 +260,18 @@ 
def test_comparison_methods_scalar(all_compare_operators): tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_array(all_compare_operators): +def test_comparison_methods_array(all_compare_operators, dtype, request): + if dtype == "arrow_string": + if all_compare_operators in ["__eq__", "__ne__"]: + reason = "NotImplementedError: Neither scalar nor ArrowStringArray" + else: + reason = "AssertionError: left is not an ExtensionArray" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype="string") + a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) expected = np.empty_like(a, dtype="object") @@ -187,30 +284,43 @@ def test_comparison_methods_array(all_compare_operators): tm.assert_extension_array_equal(result, expected) -def test_constructor_raises(): - with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) +def test_constructor_raises(cls): + if cls is pd.arrays.StringArray: + msg = "StringArray requires a sequence of strings or pandas.NA" + else: + msg = "Unsupported type '' for ArrowStringArray" + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", "b"], dtype="S1")) - with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.StringArray(np.array([])) + with pytest.raises(ValueError, match=msg): + cls(np.array([])) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", np.nan], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.nan], dtype=object)) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", None], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", None], dtype=object)) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", pd.NaT], dtype=object)) @pytest.mark.parametrize("copy", [True, False]) -def test_from_sequence_no_mutate(copy): +def test_from_sequence_no_mutate(copy, cls, request): + if cls is ArrowStringArray: + reason = ( + "ValueError: Unsupported type '' for " + "ArrowStringArray" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + nan_arr = np.array(["a", np.nan], dtype=object) na_arr = np.array(["a", pd.NA], dtype=object) - result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy) - expected = pd.arrays.StringArray(na_arr) + result = cls._from_sequence(nan_arr, copy=copy) + expected = cls(na_arr) tm.assert_extension_array_equal(result, expected) @@ -218,8 +328,13 @@ def test_from_sequence_no_mutate(copy): tm.assert_numpy_array_equal(nan_arr, expected) -def test_astype_int(): - arr = pd.array(["1", pd.NA, "3"], dtype="string") +def test_astype_int(dtype, request): + if dtype == "arrow_string": + reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.array(["1", pd.NA, "3"], dtype=dtype) result = arr.astype("Int64") expected = pd.array([1, pd.NA, 3], dtype="Int64") @@ -228,16 +343,21 @@ def test_astype_int(): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce(skipna): - arr = pd.Series(["a", "b", "c"], 
dtype="string") +def test_reduce(skipna, dtype): + arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) assert result == "abc" @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) -def test_min_max(method, skipna): - arr = pd.Series(["a", "b", "c", None], dtype="string") +def test_min_max(method, skipna, dtype, request): + if dtype == "arrow_string": + reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.Series(["a", "b", "c", None], dtype=dtype) result = getattr(arr, method)(skipna=skipna) if skipna: expected = "a" if method == "min" else "c" @@ -247,14 +367,20 @@ def test_min_max(method, skipna): @pytest.mark.parametrize("method", ["min", "max"]) -@pytest.mark.parametrize( - "arr", - [ - pd.Series(["a", "b", "c", None], dtype="string"), - pd.array(["a", "b", "c", None], dtype="string"), - ], -) -def test_min_max_numpy(method, arr): +@pytest.mark.parametrize("box", [pd.Series, pd.array]) +def test_min_max_numpy(method, box, dtype, request): + if dtype == "arrow_string": + if box is pd.array: + reason = ( + "TypeError: '<=' not supported between instances of 'str' and " + "'NoneType'" + ) + else: + reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = box(["a", "b", "c", None], dtype=dtype) result = getattr(np, method)(arr) expected = "a" if method == "min" else "c" assert result == expected @@ -262,8 +388,8 @@ def test_min_max_numpy(method, arr): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce_missing(skipna): - arr = pd.Series([None, "a", None, "b", "c", None], dtype="string") +def test_reduce_missing(skipna, dtype): + arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) if skipna: assert result == "abc" @@ -272,34 +398,49 @@ def test_reduce_missing(skipna): @td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(): +def test_arrow_array(dtype, request): # protocol added in 0.15.0 import pyarrow as pa - data = pd.array(["a", "b", "c"], dtype="string") + if dtype == "arrow_string": + reason = ( + "TypeError: Argument 'other' has incorrect type " + "(expected pyarrow.lib.ChunkedArray, got pyarrow.lib.StringArray)" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) assert arr.equals(expected) @td.skip_if_no("pyarrow", min_version="0.15.1.dev") -def test_arrow_roundtrip(): +def test_arrow_roundtrip(dtype, dtype_object): # roundtrip possible from arrow 1.0.0 import pyarrow as pa - data = pd.array(["a", "b", None], dtype="string") + data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == "string" result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) + assert isinstance(result["a"].dtype, dtype_object) tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA -def test_value_counts_na(): - arr = pd.array(["a", "b", "a", pd.NA], dtype="string") +def test_value_counts_na(dtype, request): + if dtype == "arrow_string": + reason = ( + 
"AttributeError: 'ArrowStringArray' object has no attribute 'value_counts'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64") tm.assert_series_equal(result, expected) @@ -312,12 +453,13 @@ def test_value_counts_na(): @pytest.mark.parametrize( "values, expected", [ - (pd.array(["a", "b", "c"]), np.array([False, False, False])), - (pd.array(["a", "b", None]), np.array([False, False, True])), + (["a", "b", "c"], np.array([False, False, False])), + (["a", "b", None], np.array([False, False, True])), ], ) -def test_use_inf_as_na(values, expected): +def test_use_inf_as_na(values, expected, dtype): # https://github.com/pandas-dev/pandas/issues/33655 + values = pd.array(values, dtype=dtype) with pd.option_context("mode.use_inf_as_na", True): result = values.isna() tm.assert_numpy_array_equal(result, expected) @@ -331,17 +473,31 @@ def test_use_inf_as_na(values, expected): tm.assert_frame_equal(result, expected) -def test_memory_usage(): +def test_memory_usage(dtype, request): # GH 33963 - series = pd.Series(["a", "b", "c"], dtype="string") + + if dtype == "arrow_string": + reason = "assert 147 < 147" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + series = pd.Series(["a", "b", "c"], dtype=dtype) assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) -@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) -def test_astype_from_float_dtype(dtype): +@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) +def test_astype_from_float_dtype(float_dtype, dtype, request): # https://github.com/pandas-dev/pandas/issues/36451 - s = pd.Series([0.1], dtype=dtype) - result = s.astype("string") - expected = pd.Series(["0.1"], dtype="string") + + if dtype == "arrow_string": + reason = ( + "pyarrow.lib.ArrowTypeError: Expected bytes, got a 'numpy.float64' object" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + s = pd.Series([0.1], dtype=float_dtype) + result = s.astype(dtype) + expected = pd.Series(["0.1"], dtype=dtype) tm.assert_series_equal(result, expected) From 0956147a8e2a3d1f459f88a753e319f9c34206a0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 14:43:28 +0000 Subject: [PATCH 21/78] Update pandas/core/arrays/string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a0b719e5116e7..276f4e29ca946 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -202,7 +202,7 @@ def dtype(self) -> ArrowStringDtype: """ return ArrowStringDtype() - def __array__(self, *args, **kwargs) -> "np.ndarray": + def __array__(self, dtype=None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.data.__array__(*args, **kwargs) From 963e1cf2b82e5c49ee86ee3cb46fe27e461739c6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 15:03:20 +0000 Subject: [PATCH 22/78] add a to_numpy() method and use from __array__ --- pandas/core/arrays/string_arrow.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 
276f4e29ca946..52f9f523cda2a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,7 +6,7 @@ import numpy as np -from pandas._libs import missing as libmissing +from pandas._libs import lib, missing as libmissing from pandas._typing import ArrayLike from pandas.core.dtypes.base import ExtensionDtype @@ -204,12 +204,21 @@ def dtype(self) -> ArrowStringDtype: def __array__(self, dtype=None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.data.__array__(*args, **kwargs) + return self.to_numpy(dtype=dtype) def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" return self.data + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: + """ + Convert to a NumPy ndarray. + """ + # TODO: copy and na_value arguments are ignored + return self.data.__array__(dtype=dtype) + def __len__(self) -> int: """ Length of this array. From 87b8e679374456979d6a74683f8c1641532000d3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 16:00:44 +0000 Subject: [PATCH 23/78] mypy fixup --- pandas/tests/arrays/string_/test_string.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 30fe82758313e..8f2c1171deac1 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -13,7 +13,15 @@ @pytest.fixture( - params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] + params=[ + # pandas\tests\arrays\string_\test_string.py:16: error: List item 1 has + # incompatible type "ParameterSet"; expected + # "Sequence[Collection[object]]" [list-item] + "string", + pytest.param( + "arrow_string", marks=skip_if_no_pyarrow + ), # type:ignore[list-item] + ] ) def dtype(request): return request.param From 1ed0585e79449ba524a5a64cd1abef23d212983c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 16:32:00 +0000 Subject: [PATCH 24/78] remove workaround for ARROW-9407 and ci test on pyarrow=1.0.0 --- ci/deps/azure-38-locale.yaml | 2 +- pandas/core/arrays/string_arrow.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index 8ce58e07a8542..f879111a32e67 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -34,7 +34,7 @@ dependencies: - xlsxwriter - xlwt - moto - - pyarrow>=0.15 + - pyarrow=1.0.0 - pip - pip: - pyxlsb diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 52f9f523cda2a..719dc39bd3515 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -190,10 +190,7 @@ def _chk_pyarrow_available(cls) -> None: @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() - - # TODO(ARROW-9407): Accept pd.NA in Arrow - scalars_corrected = [None if pd.isna(x) else x for x in scalars] - return cls(pa.array(scalars_corrected, type=pa.string())) + return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @property def dtype(self) -> ArrowStringDtype: From 82b84bfbe1de64b57af3aeaf476660199517c896 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 12:46:36 +0000 Subject: [PATCH 25/78] add _dtype class attribute --- pandas/core/arrays/string_arrow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 719dc39bd3515..ba04bff2efa8c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -170,6 +170,8 @@ class ArrowStringArray(ExtensionArray): Length: 4, dtype: arrow_string """ + _dtype = ArrowStringDtype() + def __init__(self, values): self._chk_pyarrow_available() if isinstance(values, pa.Array): @@ -197,7 +199,7 @@ def dtype(self) -> ArrowStringDtype: """ An instance of 'ArrowStringDtype'. """ - return ArrowStringDtype() + return self._dtype def __array__(self, dtype=None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" From b1a3032e250b6df35bfa4578fbeb5397b2cdebf3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 13:16:11 +0000 Subject: [PATCH 26/78] remove redundant integer indexing OOB and negative indexing checks in __getitem__ --- pandas/core/arrays/string_arrow.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ba04bff2efa8c..91938172b540f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -272,11 +272,6 @@ def __getitem__(self, item: Any) -> Any: "Only integers, slices and integer or " "boolean arrays are valid indices." ) - elif is_integer(item): - if item < 0: - item += len(self) - if item >= len(self): - raise IndexError("index out of bounds") value = self.data[item] if isinstance(value, pa.ChunkedArray): From 08d34f406433d50fabef3d62c340369b2a724a5f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 15:56:03 +0000 Subject: [PATCH 27/78] check pyarrow array is string type in constructor --- pandas/core/arrays/string_arrow.py | 5 +++++ pandas/tests/arrays/string_/test_string.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 91938172b540f..6d7abd3d0b156 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -181,6 +181,11 @@ def __init__(self, values): else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + if not pa.types.is_string(self.data.type): + raise ValueError( + "ArrowStringArray requires an array of strings or pandas.NA" + ) + @classmethod def _chk_pyarrow_available(cls) -> None: # TODO: maybe update import_optional_dependency to allow a minimum diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8f2c1171deac1..6f500632f9030 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -314,6 +314,22 @@ def test_constructor_raises(cls): cls(np.array(["a", pd.NaT], dtype=object)) +@td.skip_if_no("pyarrow", min_version="1.0.0") +def test_constructor_pyarrow_not_string_raises(cls): + import pyarrow as pa + + if cls is pd.arrays.StringArray: + msg = "'values' must be a NumPy array" + else: + msg = "ArrowStringArray requires an array of strings or pandas.NA" + + with pytest.raises(ValueError, match=msg): + cls(pa.array([1, 2, 3])) + + with pytest.raises(ValueError, match=msg): + cls(pa.chunked_array(pa.array([1, 2, 3]))) + + @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray: From ae49807ed406b622667a1896177dc9e560d074f2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 21:24:51 +0000 Subject: [PATCH 28/78] basic 
_from_factorized pending discussion on performant factorisation --- pandas/core/arrays/string_arrow.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6d7abd3d0b156..4b58356f1c5a9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -237,6 +237,10 @@ def __len__(self) -> int: def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) + @classmethod + def _from_factorized(cls, values, original): + return cls._from_sequence(values) + def __getitem__(self, item: Any) -> Any: """Select a subset of self. From 2e5d4c746361f64847458a0e979cdb83536c26ea Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 21:47:43 +0000 Subject: [PATCH 29/78] update constructor error message and move test --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 16 ---------------- .../tests/arrays/string_/test_string_arrow.py | 19 +++++++++++++++++++ 3 files changed, 20 insertions(+), 17 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4b58356f1c5a9..9192e044a7c77 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -183,7 +183,7 @@ def __init__(self, values): if not pa.types.is_string(self.data.type): raise ValueError( - "ArrowStringArray requires an array of strings or pandas.NA" + "ArrowStringArray requires a PyArrow (chunked) array of string type" ) @classmethod diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 6f500632f9030..8f2c1171deac1 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -314,22 +314,6 @@ def test_constructor_raises(cls): cls(np.array(["a", pd.NaT], dtype=object)) -@td.skip_if_no("pyarrow", min_version="1.0.0") -def test_constructor_pyarrow_not_string_raises(cls): - import pyarrow as pa - - if cls is pd.arrays.StringArray: - msg = "'values' must be a NumPy array" - else: - msg = "ArrowStringArray requires an array of strings or pandas.NA" - - with pytest.raises(ValueError, match=msg): - cls(pa.array([1, 2, 3])) - - with pytest.raises(ValueError, match=msg): - cls(pa.chunked_array(pa.array([1, 2, 3]))) - - @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray: diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000..c0589cc96a95f --- /dev/null +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,19 @@ +import re + +import pytest + +from pandas.core.arrays.string_arrow import ArrowStringArray + +pa = pytest.importorskip("pyarrow", minversion="1.0.0") + + +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_not_string_type_raises(chunked): + arr = pa.array([1, 2, 3]) + if chunked: + arr = pa.chunked_array(arr) + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) + with pytest.raises(ValueError, match=msg): + ArrowStringArray(arr) From c8318cc5a11c63e71b929ce4f68f9a8d30eafff5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 22:40:53 +0000 Subject: [PATCH 30/78] add _concat_same_type classmethod --- 
pandas/core/arrays/string_arrow.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9192e044a7c77..1b0cd0a37eb7e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -241,6 +241,25 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return cls._from_sequence(values) + @classmethod + def _concat_same_type(cls, to_concat) -> ArrowStringArray: + """ + Concatenate multiple ArrowStringArray. + + Parameters + ---------- + to_concat : sequence of ArrowStringArray + + Returns + ------- + ArrowStringArray + """ + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea.data.iterchunks()] + ) + ) + def __getitem__(self, item: Any) -> Any: """Select a subset of self. From 1a200a2b00dc25e57dce1a7f59fa40ab6fb1ae27 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 22:47:11 +0000 Subject: [PATCH 31/78] _as_pandas_scalar to method --- pandas/core/arrays/string_arrow.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1b0cd0a37eb7e..9780db50c9b45 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,7 +2,7 @@ from collections import abc from distutils.version import LooseVersion -from typing import Any, Optional, Sequence, Type, Union +from typing import Any, Sequence, Type, Union import numpy as np @@ -34,14 +34,6 @@ pass -def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: - scalar = arrow_scalar.as_py() - if scalar is None: - return libmissing.NA - else: - return scalar - - @register_extension_dtype class ArrowStringDtype(ExtensionDtype): """ @@ -305,7 +297,14 @@ def __getitem__(self, item: Any) -> Any: if isinstance(value, pa.ChunkedArray): return type(self)(value) else: - return _as_pandas_scalar(value) + return self._as_pandas_scalar(value) + + def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): + scalar = arrow_scalar.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") From e10be804ec30688c76c6deac9a69768a56973ec7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 10:08:01 +0000 Subject: [PATCH 32/78] copy/paste fillna from fletcher as baseline (29 failed) --- pandas/core/arrays/string_arrow.py | 59 +++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9780db50c9b45..e99fb58446548 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -307,7 +307,64 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): return scalar def fillna(self, value=None, method=None, limit=None): - raise NotImplementedError("fillna") + """Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like 'value' can be given. It's expected + that the array-like have the same length as 'self'. 
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + filled : ExtensionArray with NA/NaN filled + """ + from pandas.api.types import is_array_like + from pandas.util._validators import validate_fillna_kwargs + import pandas.core.missing as pd_missing + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + "Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self)) + ) + value = value[mask] + + if mask.any(): + if method is not None: + # pandas 1.2+ doesn't expose pad_1d anymore + if not hasattr(pd_missing, "pad_1d"): + func = pd_missing.get_fill_func(method) + else: + func = ( + pd_missing.pad_1d if method == "pad" else pd_missing.backfill_1d + ) + new_values = func(self.astype(object), limit=limit, mask=mask) + new_values = self._from_sequence(new_values, self._dtype.arrow_dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values def _reduce(self, name, skipna=True, **kwargs): if name in ["min", "max"]: From c1d308739aff23679979c6390651d407b4bdc0a2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 10:24:15 +0000 Subject: [PATCH 33/78] minor cleanup of fillna (29 failed) --- pandas/core/arrays/string_arrow.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e99fb58446548..4d0978eee87df 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -8,6 +8,7 @@ from pandas._libs import lib, missing as libmissing from pandas._typing import ArrayLike +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype @@ -307,7 +308,8 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): return scalar def fillna(self, value=None, method=None, limit=None): - """Fill NA/NaN values using the specified method. + """ + Fill NA/NaN values using the specified method. 
Parameters ---------- @@ -331,9 +333,6 @@ def fillna(self, value=None, method=None, limit=None): ------- filled : ExtensionArray with NA/NaN filled """ - from pandas.api.types import is_array_like - from pandas.util._validators import validate_fillna_kwargs - import pandas.core.missing as pd_missing value, method = validate_fillna_kwargs(value, method) @@ -349,15 +348,9 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: - # pandas 1.2+ doesn't expose pad_1d anymore - if not hasattr(pd_missing, "pad_1d"): - func = pd_missing.get_fill_func(method) - else: - func = ( - pd_missing.pad_1d if method == "pad" else pd_missing.backfill_1d - ) + func = libmissing.pad_1d if method == "pad" else libmissing.backfill_1d new_values = func(self.astype(object), limit=limit, mask=mask) - new_values = self._from_sequence(new_values, self._dtype.arrow_dtype) + new_values = self._from_sequence(new_values) else: # fill with value new_values = self.copy() From 34f563dcaca1bee86dee8656af95aa8a7ded834e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 11:21:48 +0000 Subject: [PATCH 34/78] correct mistake in previous commit (25 failed) --- pandas/core/arrays/string_arrow.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4d0978eee87df..65def9efb4a9d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -23,6 +23,7 @@ ) from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer +from pandas.core.missing import get_fill_func try: import pyarrow as pa @@ -341,14 +342,14 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): raise ValueError( - "Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self)) + f"Length of 'value' does not match. Got ({len(value)}) " + f"expected {len(self)}" ) value = value[mask] if mask.any(): if method is not None: - func = libmissing.pad_1d if method == "pad" else libmissing.backfill_1d + func = get_fill_func(method) new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values) else: From f5fc4fd967fa752982b80f996c21740eccd59c3a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 12:15:28 +0000 Subject: [PATCH 35/78] add OpsMixin (23 failed) --- pandas/core/arrays/string_arrow.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 65def9efb4a9d..c48f1c62b65e9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -7,7 +7,6 @@ import numpy as np from pandas._libs import lib, missing as libmissing -from pandas._typing import ArrayLike from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype @@ -21,6 +20,7 @@ is_integer_dtype, is_scalar, ) +from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer from pandas.core.missing import get_fill_func @@ -120,7 +120,7 @@ def __eq__(self, other) -> bool: return False -class ArrowStringArray(ExtensionArray): +class ArrowStringArray(OpsMixin, ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. 
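
Once ``ArrowStringArray`` inherits from ``OpsMixin`` (first hunk above), the comparison dunders (``__eq__``, ``__ne__``, ``__lt__`` and friends) are supplied by the mixin and all funnel into the single ``_cmp_method`` hook that the next hunk introduces, so the array only has to map each operator name onto the matching ``pyarrow.compute`` kernel. Below is a minimal sketch of that dispatch pattern; it is illustrative only: the class name is invented for the example and the dunders are written out by hand rather than generated by ``OpsMixin``.

    import operator

    import pyarrow as pa
    import pyarrow.compute as pc


    class CmpDispatchSketch:
        """Toy stand-in showing how an OpsMixin-style class funnels all
        comparison operators into one _cmp_method hook (illustrative only)."""

        def __init__(self, values):
            self.data = pa.chunked_array([pa.array(values, type=pa.string())])

        def _cmp_method(self, other, op):
            # Translate the Python operator into the matching Arrow compute kernel.
            kernels = {
                "eq": pc.equal,
                "ne": pc.not_equal,
                "lt": pc.less,
                "gt": pc.greater,
                "le": pc.less_equal,
                "ge": pc.greater_equal,
            }
            pc_func = kernels[op.__name__]
            if isinstance(other, str):
                # Nulls propagate: comparing a null slot yields a null result.
                return pc_func(self.data, pa.scalar(other))
            return NotImplemented

        # OpsMixin generates these dunders for the real array; they are written
        # out by hand here so the sketch stays self-contained.
        def __eq__(self, other):
            return self._cmp_method(other, operator.eq)

        def __lt__(self, other):
            return self._cmp_method(other, operator.lt)


    arr = CmpDispatchSketch(["a", "b", None])
    print(arr == "a")  # boolean ChunkedArray: true, false, null

Returning ``NotImplemented`` for operand types the array does not handle lets Python fall back to the reflected operation, the same convention the real ``_cmp_method`` adopts in a later patch in this series.
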
@@ -397,10 +397,10 @@ def copy(self) -> ExtensionArray: """ return type(self)(self.data) - def __eq__(self, other: Any) -> ArrayLike: - """ - Return for `self == other` (element-wise equality). - """ + def _cmp_method(self, other, op): + if op.__name__ != "eq": + return NotImplemented + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): From a5a7c85b2fb12aa8d911224d3b3d972be39e245b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 12:41:44 +0000 Subject: [PATCH 36/78] add binops (18 failed) --- pandas/core/arrays/string_arrow.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c48f1c62b65e9..7c507c037654a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -398,15 +398,21 @@ def copy(self) -> ExtensionArray: return type(self)(self.data) def _cmp_method(self, other, op): - if op.__name__ != "eq": - return NotImplemented - + ops = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } + op = ops[op.__name__] if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = pc.equal(self.data, other.data) + result = op(self.data, other.data) elif is_scalar(other): - result = pc.equal(self.data, pa.scalar(other)) + result = op(self.data, pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") From f651563571074af5d9cd8ed2f572431e1eaf3daa Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 13:29:45 +0000 Subject: [PATCH 37/78] return Boolean array for comparison ops (12 failed) --- pandas/core/arrays/string_arrow.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7c507c037654a..b1798eda3361f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,6 +2,7 @@ from collections import abc from distutils.version import LooseVersion +import operator from typing import Any, Sequence, Type, Union import numpy as np @@ -406,15 +407,25 @@ def _cmp_method(self, other, op): "le": pc.less_equal, "ge": pc.greater_equal, } - op = ops[op.__name__] + pc_func = ops[op.__name__] if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = op(self.data, other.data) + result = pc_func(self.data, other.data) elif is_scalar(other): - result = op(self.data, pa.scalar(other)) + result = pc_func(self.data, pa.scalar(other)) else: - raise NotImplementedError("Neither scalar nor ArrowStringArray") + rops = { + "eq": operator.eq, + "ne": operator.ne, + "lt": operator.gt, + "gt": operator.lt, + "le": operator.ge, + "ge": operator.le, + } + rop = rops[op.__name__] + result = rop(other, self) + return pd.array(result, dtype="boolean") # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return pd.array(result.to_pandas().values) From f5419b92cbf9f51accf48b8167ef77cad22930ea Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 14:13:40 +0000 Subject: [PATCH 38/78] fix ValueError: zero-size array to reduction operation maximum which has no identity (6 failed) --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b1798eda3361f..4d9a4f4a0ba6f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -558,7 +558,7 @@ def take( if len(self.data) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.max() >= len(self.data): + if indices_array.size > 0 and indices_array.max() >= len(self.data): raise IndexError("out of bounds value in 'indices'.") if allow_fill: From 3af5ce023310171e2c7d0153fe5b5c1c63b39ca7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 14:58:49 +0000 Subject: [PATCH 39/78] copy/paste value_counts from fletcher as baseline (5 failed) --- pandas/core/arrays/string_arrow.py | 34 +++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4d9a4f4a0ba6f..0122d80904dd0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,7 +3,7 @@ from collections import abc from distutils.version import LooseVersion import operator -from typing import Any, Sequence, Type, Union +from typing import TYPE_CHECKING, Any, Sequence, Type, Union import numpy as np @@ -36,6 +36,9 @@ except ImportError: pass +if TYPE_CHECKING: + from pandas import Series + @register_extension_dtype class ArrowStringDtype(ExtensionDtype): @@ -579,3 +582,32 @@ def take( indices_array = np.copy(indices_array) indices_array[indices_array < 0] += len(self.data) return type(self)(self.data.take(indices_array)) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + vc = self.data.value_counts() + + # Index cannot hold ExtensionArrays yet + index = pd.Index(type(self)(vc.field(0)).astype(object)) + # No missings, so we can adhere to the interface and return a numpy array. 
+ counts = np.array(vc.field(1)) + + if dropna and self.data.null_count > 0: + raise NotImplementedError("yo") + + return pd.Series(counts, index=index) From bdf4ad2abc302411343f06b52faa76c396b0a84f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 15:19:33 +0000 Subject: [PATCH 40/78] tidy imports --- pandas/core/arrays/string_arrow.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0122d80904dd0..4f548252601fd 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -12,8 +12,9 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries +from pandas.core.dtypes.missing import isna -import pandas as pd from pandas.api.types import ( is_array_like, is_bool_dtype, @@ -402,6 +403,8 @@ def copy(self) -> ExtensionArray: return type(self)(self.data) def _cmp_method(self, other, op): + from pandas.arrays import BooleanArray + ops = { "eq": pc.equal, "ne": pc.not_equal, @@ -411,7 +414,7 @@ def _cmp_method(self, other, op): "ge": pc.greater_equal, } pc_func = ops[op.__name__] - if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)): return NotImplemented if isinstance(other, ArrowStringArray): result = pc_func(self.data, other.data) @@ -428,10 +431,10 @@ def _cmp_method(self, other, op): } rop = rops[op.__name__] result = rop(other, self) - return pd.array(result, dtype="boolean") + return BooleanArray._from_sequence(result) # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return pd.array(result.to_pandas().values) + return BooleanArray._from_sequence(result.to_pandas().values) def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: """Set one or more values inplace. @@ -457,9 +460,9 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: key = check_array_indexer(self, key) if is_integer(key): - if not pd.api.types.is_scalar(value): + if not is_scalar(value): raise ValueError("Must pass scalars with scalar indexer") - elif pd.isna(value): + elif isna(value): value = None elif not isinstance(value, str): raise ValueError("Scalar must be NA or str") @@ -486,7 +489,7 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: # TODO(ARROW-9431): Directly support setitem(integers) key_array = np.asanyarray(key) - if pd.api.types.is_scalar(value): + if is_scalar(value): value = np.broadcast_to(value, len(key_array)) else: value = np.asarray(value) @@ -569,7 +572,7 @@ def take( # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) - if pd.isna(fill_value): + if isna(fill_value): return type(self)(result) return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: @@ -600,14 +603,16 @@ def value_counts(self, dropna: bool = True) -> Series: -------- Series.value_counts """ + from pandas import Index, Series + vc = self.data.value_counts() # Index cannot hold ExtensionArrays yet - index = pd.Index(type(self)(vc.field(0)).astype(object)) + index = Index(type(self)(vc.field(0)).astype(object)) # No missings, so we can adhere to the interface and return a numpy array. 
counts = np.array(vc.field(1)) if dropna and self.data.null_count > 0: raise NotImplementedError("yo") - return pd.Series(counts, index=index) + return Series(counts, index=index) From e044c7f763b76ac64a5a53cd4af740cf04a21730 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 12:34:29 +0000 Subject: [PATCH 41/78] fix test_take_non_na_fill_value (4 failed) --- pandas/core/arrays/string_arrow.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4f548252601fd..bda3036c8dae4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -574,7 +574,12 @@ def take( result = self.data.take(indices_array) if isna(fill_value): return type(self)(result) - return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[result.isna()] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill return type(self)(self.data.take(indices)) From c5625a891f41092b1f1b51b51dd2be192e1b9599 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 12:56:50 +0000 Subject: [PATCH 42/78] fix test_take_pandas_style_negative_raises (3 failed) --- pandas/core/arrays/string_arrow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bda3036c8dae4..24d498520410b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -24,7 +24,7 @@ ) from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray -from pandas.core.indexers import check_array_indexer +from pandas.core.indexers import check_array_indexer, validate_indices from pandas.core.missing import get_fill_func try: @@ -569,6 +569,7 @@ def take( if allow_fill: if (indices_array < 0).any(): + validate_indices(indices_array, len(self.data)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) From 50889fbd2a519a1e61dd1e0fb114ba9477c64b02 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 13:36:58 +0000 Subject: [PATCH 43/78] parametrize string extension tests (3 failed) --- pandas/tests/extension/test_string.py | 47 ++++--- pandas/tests/extension/test_string_arrow.py | 128 -------------------- 2 files changed, 32 insertions(+), 143 deletions(-) delete mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 27a157d2127f6..18659a0e48cd5 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -3,39 +3,49 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd -from pandas.core.arrays.string_ import StringArray, StringDtype +from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ArrowStringDtype from pandas.tests.extension import base -@pytest.fixture -def dtype(): - return StringDtype() +@pytest.fixture( + params=[ + StringDtype, + pytest.param( + ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def dtype(request): + return request.param() @pytest.fixture -def data(): 
+def data(dtype): strings = np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - return StringArray._from_sequence(strings) + return dtype.construct_array_type()._from_sequence(strings) @pytest.fixture -def data_missing(): +def data_missing(dtype): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([pd.NA, "A"]) + return dtype.construct_array_type()._from_sequence([pd.NA, "A"]) @pytest.fixture -def data_for_sorting(): - return StringArray._from_sequence(["B", "C", "A"]) +def data_for_sorting(dtype): + return dtype.construct_array_type()._from_sequence(["B", "C", "A"]) @pytest.fixture -def data_missing_for_sorting(): - return StringArray._from_sequence(["B", pd.NA, "A"]) +def data_missing_for_sorting(dtype): + return dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) @pytest.fixture @@ -44,8 +54,10 @@ def na_value(): @pytest.fixture -def data_for_grouping(): - return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) +def data_for_grouping(dtype): + return dtype.construct_array_type()._from_sequence( + ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"] + ) class TestDtype(base.BaseDtypeTests): @@ -53,7 +65,12 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - pass + def test_view(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + reason = "Fails until implement, remove before merge" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + base.BaseInterfaceTests.test_view(self, data) class TestConstructors(base.BaseConstructorsTests): diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py deleted file mode 100644 index d6c8838a55523..0000000000000 --- a/pandas/tests/extension/test_string_arrow.py +++ /dev/null @@ -1,128 +0,0 @@ -import string - -import numpy as np -import pytest - -import pandas as pd -from pandas.tests.extension import base - -pytest.importorskip("pyarrow", minversion="1.0") - -from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype - - -@pytest.fixture -def dtype(): - return ArrowStringDtype() - - -@pytest.fixture -def data(): - strings = np.random.choice(list(string.ascii_letters), size=100) - while strings[0] == strings[1]: - strings = np.random.choice(list(string.ascii_letters), size=100) - - return ArrowStringArray._from_sequence(strings) - - -@pytest.fixture -def data_missing(): - """Length 2 array with [NA, Valid]""" - return ArrowStringArray._from_sequence([pd.NA, "A"]) - - -@pytest.fixture -def data_for_sorting(): - return ArrowStringArray._from_sequence(["B", "C", "A"]) - - -@pytest.fixture -def data_missing_for_sorting(): - return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) - - -@pytest.fixture -def na_value(): - return pd.NA - - -@pytest.fixture -def data_for_grouping(): - return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) - - -class TestDtype(base.BaseDtypeTests): - pass - - -class TestInterface(base.BaseInterfaceTests): - @pytest.mark.xfail(reason="Fails until implement, remove before merge") - def test_view(self, data): - base.BaseInterfaceTests.test_view(self, data) - - -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestReshaping(base.BaseReshapingTests): - pass - - -class TestGetitem(base.BaseGetitemTests): - pass - - -class TestSetitem(base.BaseSetitemTests): - pass - - -class 
TestMissing(base.BaseMissingTests): - pass - - -class TestNoReduce(base.BaseNoReduceTests): - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): - op_name = all_numeric_reductions - - if op_name in ["min", "max"]: - return None - - s = pd.Series(data) - with pytest.raises(TypeError): - getattr(s, op_name)(skipna=skipna) - - -class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="returns nullable") - def test_value_counts(self, all_data, dropna): - return super().test_value_counts(all_data, dropna) - - -class TestCasting(base.BaseCastingTests): - pass - - -class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): - result = getattr(s, op_name)(other) - expected = getattr(s.astype(object), op_name)(other).astype("boolean") - self.assert_series_equal(result, expected) - - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - s = pd.Series(data) - self._compare_other(s, data, op_name, "abc") - - -class TestParsing(base.BaseParsingTests): - pass - - -class TestPrinting(base.BasePrintingTests): - pass - - -class TestGroupBy(base.BaseGroupbyTests): - pass From 0e1773bde82c6e1e85d8a52a53434150e1e6efbe Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 14:19:28 +0000 Subject: [PATCH 44/78] xfail other 2 tests expecting views (1 failed) --- pandas/tests/extension/test_string.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 18659a0e48cd5..3f95117b238a7 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -78,7 +78,12 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - pass + def test_transpose(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + reason = "Fails until implement, remove before merge" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + base.BaseReshapingTests.test_transpose(self, data) class TestGetitem(base.BaseGetitemTests): @@ -86,7 +91,12 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - pass + def test_setitem_preserves_views(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + reason = "Fails until implement, remove before merge" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + base.BaseSetitemTests.test_setitem_preserves_views(self, data) class TestMissing(base.BaseMissingTests): From 7bb9574792b7afedc1a3eb698e798f6a002d991b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 15:11:15 +0000 Subject: [PATCH 45/78] add ensure_string_array to _from_sequence (1 failed) --- pandas/_libs/lib.pyx | 2 +- pandas/core/arrays/string_arrow.py | 4 ++++ pandas/core/dtypes/cast.py | 6 +++++- pandas/tests/arrays/string_/test_string.py | 10 +--------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0b0334d52c1e9..6abf9c06f7289 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -634,7 +634,7 @@ cpdef ndarray[object] ensure_string_array( ---------- arr : array-like The values to be converted to str, if needed. - na_value : Any + na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. 
convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 24d498520410b..c19884a091411 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -196,6 +196,10 @@ def _chk_pyarrow_available(cls) -> None: @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() + # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value + scalars = lib.ensure_string_array( + scalars, na_value=cls._dtype.na_value, copy=copy + ) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @property diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 692da8f8e021e..c2f28b17e7227 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -360,13 +360,17 @@ def maybe_cast_to_extension_array( ExtensionArray or obj """ from pandas.core.arrays.string_ import StringArray + from pandas.core.arrays.string_arrow import ArrowStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg # Everything can be be converted to StringArrays, but we may not want to convert - if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string": + if ( + issubclass(cls, (StringArray, ArrowStringArray)) + and lib.infer_dtype(obj) != "string" + ): return obj try: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8f2c1171deac1..250af3ab13d3a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -495,16 +495,8 @@ def test_memory_usage(dtype, request): @pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) -def test_astype_from_float_dtype(float_dtype, dtype, request): +def test_astype_from_float_dtype(float_dtype, dtype): # https://github.com/pandas-dev/pandas/issues/36451 - - if dtype == "arrow_string": - reason = ( - "pyarrow.lib.ArrowTypeError: Expected bytes, got a 'numpy.float64' object" - ) - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - s = pd.Series([0.1], dtype=float_dtype) result = s.astype(dtype) expected = pd.Series(["0.1"], dtype=dtype) From 51d7d0a9fc1786732622ab684dbb8666bb1026c1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 15:44:56 +0000 Subject: [PATCH 46/78] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c19884a091411..4d8879a781c48 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -198,7 +198,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value scalars = lib.ensure_string_array( - scalars, na_value=cls._dtype.na_value, copy=copy + scalars, na_value=cls._dtype.na_value, copy=False ) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @@ -625,4 +625,4 @@ def value_counts(self, dropna: bool = True) -> Series: if dropna and self.data.null_count > 0: raise NotImplementedError("yo") - return Series(counts, index=index) + return Series(counts, 
index=index).astype("Int64") From 3cf5c9183561046f6842f7e8cc3d82a9afd41622 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 16:17:28 +0000 Subject: [PATCH 47/78] return NotImplemented in comparisons (7 failed) --- pandas/core/arrays/string_arrow.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4d8879a781c48..d733612310ae7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,7 +2,6 @@ from collections import abc from distutils.version import LooseVersion -import operator from typing import TYPE_CHECKING, Any, Sequence, Type, Union import numpy as np @@ -425,17 +424,7 @@ def _cmp_method(self, other, op): elif is_scalar(other): result = pc_func(self.data, pa.scalar(other)) else: - rops = { - "eq": operator.eq, - "ne": operator.ne, - "lt": operator.gt, - "gt": operator.lt, - "le": operator.ge, - "ge": operator.le, - } - rop = rops[op.__name__] - result = rop(other, self) - return BooleanArray._from_sequence(result) + return NotImplemented # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return BooleanArray._from_sequence(result.to_pandas().values) From 07239a05ccd651d71290726a420b98f47ff178fe Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 16:54:08 +0000 Subject: [PATCH 48/78] move arrow function lookup dict to module scope (7 failed) --- pandas/core/arrays/string_arrow.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d733612310ae7..b453d0cbb6863 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -35,6 +35,16 @@ import pyarrow.compute as pc except ImportError: pass + else: + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } + if TYPE_CHECKING: from pandas import Series @@ -408,15 +418,7 @@ def copy(self) -> ExtensionArray: def _cmp_method(self, other, op): from pandas.arrays import BooleanArray - ops = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, - } - pc_func = ops[op.__name__] + pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)): return NotImplemented if isinstance(other, ArrowStringArray): From 9a7cfc5b21ff5515eda7110ea56aa36802af1ecd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 19:15:48 +0000 Subject: [PATCH 49/78] remove isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)) check --- pandas/core/arrays/string_arrow.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b453d0cbb6863..bd05df48e1226 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -11,7 +11,6 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries from pandas.core.dtypes.missing import isna from pandas.api.types import ( @@ -419,8 +418,6 @@ def _cmp_method(self, other, op): from pandas.arrays import BooleanArray pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)): - return NotImplemented if isinstance(other, ArrowStringArray): 
result = pc_func(self.data, other.data) elif is_scalar(other): From 2ba0dcddfe675dbadf7b4cc19d9180bd31b1e89e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:12:19 +0000 Subject: [PATCH 50/78] remove na_value=cls._dtype.na_value from ensure_string_array call (7 failed) --- pandas/core/arrays/string_arrow.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bd05df48e1226..121984b548074 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -205,9 +205,7 @@ def _chk_pyarrow_available(cls) -> None: def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value - scalars = lib.ensure_string_array( - scalars, na_value=cls._dtype.na_value, copy=False - ) + scalars = lib.ensure_string_array(scalars, copy=False) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @property From 97c56e28a29b3ef11818613477e0a5b5ee36da38 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:19:26 +0000 Subject: [PATCH 51/78] coloate _from_sequence_of_strings with _from_sequence (7 failed) --- pandas/core/arrays/string_arrow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 121984b548074..d100489800401 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -208,6 +208,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): scalars = lib.ensure_string_array(scalars, copy=False) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + @property def dtype(self) -> ArrowStringDtype: """ @@ -242,10 +246,6 @@ def __len__(self) -> int: """ return len(self.data) - @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence(strings, dtype=dtype, copy=copy) - @classmethod def _from_factorized(cls, values, original): return cls._from_sequence(values) From d6d3543bd048a6bcb3e0bc4f918cc44e23884207 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:21:43 +0000 Subject: [PATCH 52/78] revert change to extra_compile_args in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 177cf4104133e..9a9d12ce4d2ba 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From d71a895ae59a49e9ddf38c679697e1fc6f3f8f11 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:38:08 +0000 Subject: [PATCH 53/78] sync fillna docstring with base --- pandas/core/arrays/string_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d100489800401..379568c18fd3a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -336,7 +336,7 @@ def fillna(self, value=None, method=None, limit=None): method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use 
for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + backfill / bfill: use NEXT valid observation to fill gap. limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is @@ -347,9 +347,9 @@ def fillna(self, value=None, method=None, limit=None): Returns ------- - filled : ExtensionArray with NA/NaN filled + ExtensionArray + With NA/NaN filled. """ - value, method = validate_fillna_kwargs(value, method) mask = self.isna() From f342b62716e735ddc1bb1fddd56fc3da0eb40eb5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:43:29 +0000 Subject: [PATCH 54/78] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/extension/test_string.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b453d0cbb6863..b0efb73e48c88 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -368,7 +368,7 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: func = get_fill_func(method) - new_values = func(self.astype(object), limit=limit, mask=mask) + new_values = func(self.to_numpy(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values) else: # fill with value diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 3f95117b238a7..3653ddf846510 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -65,12 +65,12 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - def test_view(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_view(self, data, request): + if isinstance(data.dtype, ArrowStringDtype): reason = "Fails until implement, remove before merge" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - base.BaseInterfaceTests.test_view(self, data) + super().test_view(self, data) class TestConstructors(base.BaseConstructorsTests): From b3c63479ea0c9eedd6af58c93453a7769fffcd33 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:51:11 +0000 Subject: [PATCH 55/78] other base.Base*Tests -> super() --- pandas/tests/extension/test_string.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 3653ddf846510..2e820940716a8 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -70,7 +70,7 @@ def test_view(self, data, request): reason = "Fails until implement, remove before merge" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - super().test_view(self, data) + super().test_view(data) class TestConstructors(base.BaseConstructorsTests): @@ -83,7 +83,7 @@ def test_transpose(self, data, dtype, request): reason = "Fails until implement, remove before merge" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - base.BaseReshapingTests.test_transpose(self, data) + super().test_transpose(data) class TestGetitem(base.BaseGetitemTests): @@ -96,7 +96,7 @@ def test_setitem_preserves_views(self, data, dtype, request): reason = "Fails until implement, remove before merge" 
mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - base.BaseSetitemTests.test_setitem_preserves_views(self, data) + super().test_setitem_preserves_views(data) class TestMissing(base.BaseMissingTests): From 26bca2535133ad5c0a634ce2fc9744e4f4281e28 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 16:24:22 +0000 Subject: [PATCH 56/78] len(item) == 0 -> not len(item) --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1fb387a053e5e..9a22c7c1eae9b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -298,7 +298,7 @@ def __getitem__(self, item: Any) -> Any: if isinstance(item, abc.Iterable): if not is_array_like(item): item = np.array(item) - if len(item) == 0: + if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): return self.take(item) From 9579444451c44548a6f0b2970d443eb82901996e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 16:50:56 +0000 Subject: [PATCH 57/78] update copy docstring and return type --- pandas/core/arrays/string_arrow.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9a22c7c1eae9b..23e5bc58edb86 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -397,18 +397,13 @@ def isna(self) -> np.ndarray: # TODO: Implement .to_numpy for ChunkedArray return self.data.is_null().to_pandas().values - def copy(self) -> ExtensionArray: + def copy(self) -> ArrowStringArray: """ - Return a copy of the array. - - Parameters - ---------- - deep : bool, default False - Also copy the underlying data backing this array. + Return a shallow copy of the array. 
Returns ------- - ExtensionArray + ArrowStringArray """ return type(self)(self.data) From 88094a7726a8f6eb520796cc27d6e743e0b6dd34 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 18:56:20 +0000 Subject: [PATCH 58/78] test_constructor_not_string_type_raises with np.ndarray --- .../tests/arrays/string_/test_string_arrow.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index c0589cc96a95f..ec7f57940a67f 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -1,5 +1,6 @@ import re +import numpy as np import pytest from pandas.core.arrays.string_arrow import ArrowStringArray @@ -8,12 +9,18 @@ @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_not_string_type_raises(chunked): - arr = pa.array([1, 2, 3]) +@pytest.mark.parametrize("array", [np, pa]) +def test_constructor_not_string_type_raises(array, chunked): + arr = array.array([1, 2, 3]) if chunked: + if array is np: + pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" - ) + if array is np: + msg = "Unsupported type '' for ArrowStringArray" + else: + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) From ba0cee8fb9a202ce4a4fd45cd2ea4814f42fde14 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 19:41:43 +0000 Subject: [PATCH 59/78] update test_from_sequence_no_mutate (7 failed) --- pandas/tests/arrays/string_/test_string.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 250af3ab13d3a..cd9bd404043a3 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -316,11 +316,8 @@ def test_constructor_raises(cls): @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): - if cls is ArrowStringArray: - reason = ( - "ValueError: Unsupported type '' for " - "ArrowStringArray" - ) + if cls is ArrowStringArray and copy is False: + reason = "AssertionError: numpy array are different" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -328,7 +325,13 @@ def test_from_sequence_no_mutate(copy, cls, request): na_arr = np.array(["a", pd.NA], dtype=object) result = cls._from_sequence(nan_arr, copy=copy) - expected = cls(na_arr) + + if cls is ArrowStringArray: + import pyarrow as pa + + expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + else: + expected = cls(na_arr) tm.assert_extension_array_equal(result, expected) From 6709ac3c198ff48ffbcdf85b0861272b2b87de02 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 20:24:56 +0000 Subject: [PATCH 60/78] change xfail message for base extension array tests (7 failed) --- pandas/tests/extension/test_string.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 2e820940716a8..db1940226e04e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -67,8 +67,7 @@ class TestDtype(base.BaseDtypeTests): class 
TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): if isinstance(data.dtype, ArrowStringDtype): - reason = "Fails until implement, remove before merge" - mark = pytest.mark.xfail(reason=reason) + mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_view(data) @@ -80,8 +79,7 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): def test_transpose(self, data, dtype, request): if isinstance(dtype, ArrowStringDtype): - reason = "Fails until implement, remove before merge" - mark = pytest.mark.xfail(reason=reason) + mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_transpose(data) @@ -93,8 +91,7 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): def test_setitem_preserves_views(self, data, dtype, request): if isinstance(dtype, ArrowStringDtype): - reason = "Fails until implement, remove before merge" - mark = pytest.mark.xfail(reason=reason) + mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_setitem_preserves_views(data) From 11388b4e84494dd97e4f88cfe6273c40a270cdd5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 20:37:28 +0000 Subject: [PATCH 61/78] change xfail reason message in test_value_counts_na --- pandas/tests/arrays/string_/test_string.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index cd9bd404043a3..8552b83568cf2 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -445,9 +445,7 @@ def test_arrow_roundtrip(dtype, dtype_object): def test_value_counts_na(dtype, request): if dtype == "arrow_string": - reason = ( - "AttributeError: 'ArrowStringArray' object has no attribute 'value_counts'" - ) + reason = "TypeError: boolean value of NA is ambiguous" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) From eb284e767059cbcb2d9ba2f50d33ec13ed65400d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 20:45:43 +0000 Subject: [PATCH 62/78] skip test_memory_usage for ArrowStringArray --- pandas/tests/arrays/string_/test_string.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8552b83568cf2..210b3791d431e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -486,9 +486,7 @@ def test_memory_usage(dtype, request): # GH 33963 if dtype == "arrow_string": - reason = "assert 147 < 147" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) + pytest.skip("not applicable") series = pd.Series(["a", "b", "c"], dtype=dtype) From 9b7070923224daf065ff6ac47c558700f88eec4d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 12:42:05 +0000 Subject: [PATCH 63/78] part implementation of na_value in to_numpy --- pandas/core/arrays/string_arrow.py | 9 +++++++-- pandas/tests/arrays/string_/test_string.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 23e5bc58edb86..58e076df4ac6b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -233,8 +233,13 @@ def to_numpy( """ Convert to a NumPy ndarray. 
""" - # TODO: copy and na_value arguments are ignored - return self.data.__array__(dtype=dtype) + # TODO: copy argument is ignored + + if na_value is lib.no_default: + na_value = self._dtype.na_value + result = self.data.__array__(dtype=dtype) + result[isna(result)] = na_value + return result def __len__(self) -> int: """ diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 210b3791d431e..fa1580db7fe64 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -500,3 +500,18 @@ def test_astype_from_float_dtype(float_dtype, dtype): result = s.astype(dtype) expected = pd.Series(["0.1"], dtype=dtype) tm.assert_series_equal(result, expected) + + +def test_to_numpy_returns_pdna_default(dtype): + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = np.array(arr) + expected = np.array(["a", pd.NA, "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_na_value(dtype, nulls_fixture): + na_value = nulls_fixture + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = arr.to_numpy(na_value=na_value) + expected = np.array(["a", na_value, "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) From 6757feb97071a487400fb9bc9ba44b43daa95c03 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 13:37:24 +0000 Subject: [PATCH 64/78] remove is_array_like in __getitem__ --- pandas/core/arrays/string_arrow.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 58e076df4ac6b..b76af7f886e60 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections import abc from distutils.version import LooseVersion from typing import TYPE_CHECKING, Any, Sequence, Type, Union @@ -10,16 +9,17 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna - -from pandas.api.types import ( +from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, is_integer, is_integer_dtype, + is_list_like, is_scalar, ) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer, validate_indices @@ -298,11 +298,9 @@ def __getitem__(self, item: Any) -> Any: For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. """ - item = check_array_indexer(self, item) - if isinstance(item, abc.Iterable): - if not is_array_like(item): - item = np.array(item) + if is_list_like(item): + item = check_array_indexer(self, item) if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): From 460ea3811e89d34d5ad10c276e2b1f88825ca6d7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 14:48:47 +0000 Subject: [PATCH 65/78] Revert "remove is_array_like in __getitem__" This reverts commit 6757feb97071a487400fb9bc9ba44b43daa95c03. 
--- pandas/core/arrays/string_arrow.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b76af7f886e60..58e076df4ac6b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import abc from distutils.version import LooseVersion from typing import TYPE_CHECKING, Any, Sequence, Type, Union @@ -9,17 +10,16 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.common import ( +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.api.types import ( is_array_like, is_bool_dtype, is_integer, is_integer_dtype, - is_list_like, is_scalar, ) -from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna - from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer, validate_indices @@ -298,9 +298,11 @@ def __getitem__(self, item: Any) -> Any: For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. """ + item = check_array_indexer(self, item) - if is_list_like(item): - item = check_array_indexer(self, item) + if isinstance(item, abc.Iterable): + if not is_array_like(item): + item = np.array(item) if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): From 7bee5e29309ed1ff99a225b54e719cd7b35c9d72 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 14:50:20 +0000 Subject: [PATCH 66/78] remove just is_array_like in __getitem__ --- pandas/core/arrays/string_arrow.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 58e076df4ac6b..bc9ab5157b6b1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -301,8 +301,6 @@ def __getitem__(self, item: Any) -> Any: item = check_array_indexer(self, item) if isinstance(item, abc.Iterable): - if not is_array_like(item): - item = np.array(item) if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): From 91f37632160c32da39fdd5fc8d34fe50b7944403 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 15:21:03 +0000 Subject: [PATCH 67/78] Update pandas/core/arrays/string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bc9ab5157b6b1..7b781b95f9dd9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -303,7 +303,7 @@ def __getitem__(self, item: Any) -> Any: if isinstance(item, abc.Iterable): if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item): + elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item): return type(self)(self.data.filter(item)) From 36b662ab344b4915f806782cda23877de76fe19f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 15:23:18 +0000 Subject: [PATCH 68/78] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- 
pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7b781b95f9dd9..f41e65e55af9b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -300,12 +300,12 @@ def __getitem__(self, item: Any) -> Any: """ item = check_array_indexer(self, item) - if isinstance(item, abc.Iterable): + if isinstance(item, np.ndarray): if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item.dtype): return self.take(item) - elif is_bool_dtype(item): + elif is_bool_dtype(item.dtype): return type(self)(self.data.filter(item)) else: raise IndexError( From 7a9ef9c83d1b1591db9c8de62df530cb99234d00 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 16:04:35 +0000 Subject: [PATCH 69/78] lint fixup --- pandas/core/arrays/string_arrow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f41e65e55af9b..36d97c82750da 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections import abc from distutils.version import LooseVersion from typing import TYPE_CHECKING, Any, Sequence, Type, Union From 5db87883bbab17a518715c044d32f51ec75895f9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 16:07:47 +0000 Subject: [PATCH 70/78] xfail test_astype_roundtrip --- pandas/tests/arrays/string_/test_string.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index fa1580db7fe64..ac0cef6391426 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -121,7 +121,12 @@ def test_string_methods(input, method, dtype, request): tm.assert_series_equal(result.astype(object), expected) -def test_astype_roundtrip(dtype): +def test_astype_roundtrip(dtype, request): + if dtype == "arrow_string": + reason = "ValueError: Could not convert object to NumPy datetime" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + s = pd.Series(pd.date_range("2000", periods=12)) s[0] = None From c76c39f6b8fff32ece610271a56a1aa327323f2e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 16:39:08 +0000 Subject: [PATCH 71/78] update expected in test_arrow_array --- pandas/tests/arrays/string_/test_string.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ac0cef6391426..ae2cae4cd4c53 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -414,21 +414,16 @@ def test_reduce_missing(skipna, dtype): @td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(dtype, request): +def test_arrow_array(dtype): # protocol added in 0.15.0 import pyarrow as pa - if dtype == "arrow_string": - reason = ( - "TypeError: Argument 'other' has incorrect type " - "(expected pyarrow.lib.ChunkedArray, got pyarrow.lib.StringArray)" - ) - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) + if dtype == "arrow_string": + expected = 
pa.chunked_array(expected) + assert arr.equals(expected) From 24a782dc35382ec024661742176789c9f087f120 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 15 Nov 2020 13:11:59 +0000 Subject: [PATCH 72/78] add fallback for scalar comparison ops --- pandas/core/arrays/string_arrow.py | 10 ++++++- pandas/tests/arrays/string_/test_string.py | 35 ++++++++++++++-------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 36d97c82750da..9262147f801ce 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -416,7 +416,15 @@ def _cmp_method(self, other, op): if isinstance(other, ArrowStringArray): result = pc_func(self.data, other.data) elif is_scalar(other): - result = pc_func(self.data, pa.scalar(other)) + try: + result = pc_func(self.data, pa.scalar(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + mask = isna(self) | isna(other) + valid = ~mask + result = np.zeros(len(self), dtype="bool") + result[valid] = op(np.array(self)[valid], other) + return BooleanArray(result, mask) + else: return NotImplemented diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ae2cae4cd4c53..07e9484994c26 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -247,20 +247,8 @@ def test_add_frame(dtype): tm.assert_frame_equal(result, expected) -def test_comparison_methods_scalar(all_compare_operators, dtype, request): - if dtype == "arrow_string": - if all_compare_operators in ["__eq__", "__ne__"]: - reason = ( - "pyarrow.lib.ArrowInvalid: Could not convert with type NAType: " - "did not recognize Python value type when inferring an Arrow data type" - ) - else: - reason = "AssertionError: left is not an ExtensionArray" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - +def test_comparison_methods_scalar(all_compare_operators, dtype): op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) @@ -268,11 +256,32 @@ def test_comparison_methods_scalar(all_compare_operators, dtype, request): expected = pd.array(expected, dtype="boolean") tm.assert_extension_array_equal(result, expected) + +def test_comparison_methods_scalar_pd_na(all_compare_operators, dtype): + op_name = all_compare_operators + a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) expected = pd.array([None, None, None], dtype="boolean") tm.assert_extension_array_equal(result, expected) +def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, request): + if all_compare_operators not in ["__eq__", "__ne__"]: + reason = "comparison op not supported between instances of 'str' and 'int'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + op_name = all_compare_operators + a = pd.array(["a", None, "c"], dtype=dtype) + other = 42 + result = getattr(a, op_name)(other) + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected = pd.array(expected_data, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + def test_comparison_methods_array(all_compare_operators, dtype, request): if dtype == "arrow_string": if all_compare_operators in ["__eq__", "__ne__"]: From 353bff9de4bb21f5f7bb59006a6540247e7ebec5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 15 Nov 2020 14:01:41 
+0000 Subject: [PATCH 73/78] dispatch to pyarrow for comparion with np.ndarray (1 failed) --- pandas/core/arrays/string_arrow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9262147f801ce..680752e6a7e07 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -415,6 +415,8 @@ def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowStringArray): result = pc_func(self.data, other.data) + elif isinstance(other, np.ndarray): + result = pc_func(self.data, other) elif is_scalar(other): try: result = pc_func(self.data, pa.scalar(other)) @@ -424,7 +426,6 @@ def _cmp_method(self, other, op): result = np.zeros(len(self), dtype="bool") result[valid] = op(np.array(self)[valid], other) return BooleanArray(result, mask) - else: return NotImplemented From be939474bc0ecb840619da039e02630a7f5daf26 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 16 Nov 2020 12:50:27 +0000 Subject: [PATCH 74/78] fix test_reindex_non_na_fill_value --- pandas/core/arrays/string_arrow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 680752e6a7e07..cb44c5ae71518 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -564,7 +564,8 @@ def take( raise IndexError("out of bounds value in 'indices'.") if allow_fill: - if (indices_array < 0).any(): + fill_mask = indices_array < 0 + if fill_mask.any(): validate_indices(indices_array, len(self.data)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) @@ -574,7 +575,7 @@ def take( # TODO: ArrowNotImplementedError: Function fill_null has no # kernel matching input types (array[string], scalar[string]) result = type(self)(result) - result[result.isna()] = fill_value + result[fill_mask] = fill_value return result # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: From 52440a75f506c0b6fbb6b10d8f3b7cfabfc84987 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 16 Nov 2020 13:04:03 +0000 Subject: [PATCH 75/78] use fill_mask in pa indices_array --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index cb44c5ae71518..2343c3b2bba4d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -568,7 +568,7 @@ def take( if fill_mask.any(): validate_indices(indices_array, len(self.data)) # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=indices_array < 0) + indices_array = pa.array(indices_array, mask=fill_mask) result = self.data.take(indices_array) if isna(fill_value): return type(self)(result) From bd05c2c0aff018739be9661206e382cd063cd386 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 16 Nov 2020 13:31:22 +0000 Subject: [PATCH 76/78] add comment to __gettem__ --- pandas/core/arrays/string_arrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 2343c3b2bba4d..5e4d3d5f17185 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -312,6 +312,8 @@ def __getitem__(self, item: Any) -> Any: "boolean arrays are valid indices." 
) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. value = self.data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) From 27c8de581aface7d69e1fadc95de379227b084a8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 17 Nov 2020 10:51:54 +0000 Subject: [PATCH 77/78] add comment on pyarrow compute --- pandas/core/arrays/string_arrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5e4d3d5f17185..be3ce7330314c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -29,6 +29,8 @@ except ImportError: pa = None else: + # our min supported version of pyarrow, 0.15.1, does not have a compute + # module try: import pyarrow.compute as pc except ImportError: From b6713e95bf36f7cdfa5f12c4cc57b49020a3033a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 17 Nov 2020 11:05:58 +0000 Subject: [PATCH 78/78] privatize `data` --- pandas/core/arrays/string_arrow.py | 54 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index be3ce7330314c..184fbc050036b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -183,13 +183,13 @@ class ArrowStringArray(OpsMixin, ExtensionArray): def __init__(self, values): self._chk_pyarrow_available() if isinstance(values, pa.Array): - self.data = pa.chunked_array([values]) + self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): - self.data = values + self._data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") - if not pa.types.is_string(self.data.type): + if not pa.types.is_string(self._data.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of string type" ) @@ -226,7 +226,7 @@ def __array__(self, dtype=None) -> np.ndarray: def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" - return self.data + return self._data def to_numpy( self, dtype=None, copy: bool = False, na_value=lib.no_default @@ -238,7 +238,7 @@ def to_numpy( if na_value is lib.no_default: na_value = self._dtype.na_value - result = self.data.__array__(dtype=dtype) + result = self._data.__array__(dtype=dtype) result[isna(result)] = na_value return result @@ -250,7 +250,7 @@ def __len__(self) -> int: ------- length : int """ - return len(self.data) + return len(self._data) @classmethod def _from_factorized(cls, values, original): @@ -271,7 +271,7 @@ def _concat_same_type(cls, to_concat) -> ArrowStringArray: """ return cls( pa.chunked_array( - [array for ea in to_concat for array in ea.data.iterchunks()] + [array for ea in to_concat for array in ea._data.iterchunks()] ) ) @@ -307,7 +307,7 @@ def __getitem__(self, item: Any) -> Any: elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): - return type(self)(self.data.filter(item)) + return type(self)(self._data.filter(item)) else: raise IndexError( "Only integers, slices and integer or " @@ -316,7 +316,7 @@ def __getitem__(self, item: Any) -> Any: # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. 
- value = self.data[item] + value = self._data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: @@ -392,7 +392,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. """ - return self.data.nbytes + return self._data.nbytes def isna(self) -> np.ndarray: """ @@ -401,7 +401,7 @@ def isna(self) -> np.ndarray: This should return a 1-D array the same length as 'self'. """ # TODO: Implement .to_numpy for ChunkedArray - return self.data.is_null().to_pandas().values + return self._data.is_null().to_pandas().values def copy(self) -> ArrowStringArray: """ @@ -411,19 +411,19 @@ def copy(self) -> ArrowStringArray: ------- ArrowStringArray """ - return type(self)(self.data) + return type(self)(self._data) def _cmp_method(self, other, op): from pandas.arrays import BooleanArray pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowStringArray): - result = pc_func(self.data, other.data) + result = pc_func(self._data, other._data) elif isinstance(other, np.ndarray): - result = pc_func(self.data, other) + result = pc_func(self._data, other) elif is_scalar(other): try: - result = pc_func(self.data, pa.scalar(other)) + result = pc_func(self._data, pa.scalar(other)) except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask @@ -469,11 +469,11 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: # Slice data and insert inbetween new_data = [ - *self.data[0:key].chunks, + *self._data[0:key].chunks, pa.array([value], type=pa.string()), - *self.data[(key + 1) :].chunks, + *self._data[(key + 1) :].chunks, ] - self.data = pa.chunked_array(new_data) + self._data = pa.chunked_array(new_data) else: # Convert to integer indices and iteratively assign. # TODO: Make a faster variant of this in Arrow upstream. @@ -562,18 +562,18 @@ def take( else: indices_array = indices - if len(self.data) == 0 and (indices_array >= 0).any(): + if len(self._data) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self.data): + if indices_array.size > 0 and indices_array.max() >= len(self._data): raise IndexError("out of bounds value in 'indices'.") if allow_fill: fill_mask = indices_array < 0 if fill_mask.any(): - validate_indices(indices_array, len(self.data)) + validate_indices(indices_array, len(self._data)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=fill_mask) - result = self.data.take(indices_array) + result = self._data.take(indices_array) if isna(fill_value): return type(self)(result) # TODO: ArrowNotImplementedError: Function fill_null has no @@ -584,14 +584,14 @@ def take( # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill - return type(self)(self.data.take(indices)) + return type(self)(self._data.take(indices)) else: # allow_fill=False # TODO(ARROW-9432): Treat negative indices as indices from the right. 
if (indices_array < 0).any(): # Don't modify in-place indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self.data) - return type(self)(self.data.take(indices_array)) + indices_array[indices_array < 0] += len(self._data) + return type(self)(self._data.take(indices_array)) def value_counts(self, dropna: bool = True) -> Series: """ @@ -612,14 +612,14 @@ def value_counts(self, dropna: bool = True) -> Series: """ from pandas import Index, Series - vc = self.data.value_counts() + vc = self._data.value_counts() # Index cannot hold ExtensionArrays yet index = Index(type(self)(vc.field(0)).astype(object)) # No missings, so we can adhere to the interface and return a numpy array. counts = np.array(vc.field(1)) - if dropna and self.data.null_count > 0: + if dropna and self._data.null_count > 0: raise NotImplementedError("yo") return Series(counts, index=index).astype("Int64")
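
Note (not part of the patch series): the comparison and value_counts hunks above lean on pyarrow.compute kernels and on ChunkedArray.value_counts(); the snippet below is a minimal standalone sketch of that pattern, assuming a pyarrow version recent enough to ship the pyarrow.compute module, with purely illustrative data.

import pyarrow as pa
import pyarrow.compute as pc

# A string ChunkedArray with a null, mirroring what ArrowStringArray wraps.
data = pa.chunked_array([pa.array(["a", None, "c"], type=pa.string())])

# Comparison against a scalar via a pyarrow.compute kernel; the null slot stays null.
result = pc.equal(data, pa.scalar("a", type=pa.string()))
print(result.to_pandas().values)  # object ndarray: [True, None, False]

# value_counts on a ChunkedArray yields a struct array of (values, counts),
# which is what the value_counts hunk unpacks with vc.field(0) / vc.field(1).
vc = data.value_counts()
print(vc.field(0))  # distinct strings
print(vc.field(1))  # corresponding counts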