Skip to content

Commit 03a8009

Browse files
committed
REF: Move value_counts, isin to ArrowExtensionArray
1 parent 6033ed4 commit 03a8009

File tree

2 files changed

+74
-73
lines changed

2 files changed

+74
-73
lines changed

pandas/core/arrays/_mixins.py

+68
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from pandas.compat import (
3333
pa_version_under1p01,
3434
pa_version_under2p0,
35+
pa_version_under3p0,
3536
pa_version_under5p0,
3637
)
3738
from pandas.errors import AbstractMethodError
@@ -86,6 +87,8 @@
8687
NumpyValueArrayLike,
8788
)
8889

90+
from pandas import Series
91+
8992

9093
def ravel_compat(meth: F) -> F:
9194
"""
@@ -544,6 +547,7 @@ class ArrowExtensionArray(ExtensionArray):
544547
"""
545548

546549
_data: pa.ChunkedArray
550+
_pa_dtype: pa.DataType()
547551

548552
def __init__(self, values: pa.ChunkedArray) -> None:
549553
self._data = values
@@ -599,6 +603,70 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
599603
"""
600604
return type(self)(self._data)
601605

606+
def isin(self, values):
607+
if pa_version_under2p0:
608+
return super().isin(values)
609+
610+
value_set = [
611+
pa_scalar.as_py()
612+
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
613+
if pa_scalar.type in (self._pa_dtype, pa.null())
614+
]
615+
616+
# for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
617+
# for null values, so we short-circuit to return all False array.
618+
if not len(value_set):
619+
return np.zeros(len(self), dtype=bool)
620+
621+
kwargs = {}
622+
if pa_version_under3p0:
623+
# in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
624+
# with unexpected keyword argument in pyarrow 3.0.0+
625+
kwargs["skip_null"] = True
626+
627+
result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
628+
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
629+
# to False
630+
return np.array(result, dtype=np.bool_)
631+
632+
def value_counts(self, dropna: bool = True) -> Series:
633+
"""
634+
Return a Series containing counts of each unique value.
635+
636+
Parameters
637+
----------
638+
dropna : bool, default True
639+
Don't include counts of missing values.
640+
641+
Returns
642+
-------
643+
counts : Series
644+
645+
See Also
646+
--------
647+
Series.value_counts
648+
"""
649+
from pandas import (
650+
Index,
651+
Series,
652+
)
653+
654+
vc = self._data.value_counts()
655+
656+
values = vc.field(0)
657+
counts = vc.field(1)
658+
if dropna and self._data.null_count > 0:
659+
mask = values.is_valid()
660+
values = values.filter(mask)
661+
counts = counts.filter(mask)
662+
663+
# No missing values so we can adhere to the interface and return a numpy array.
664+
counts = np.array(counts)
665+
666+
index = Index(type(self)(values))
667+
668+
return Series(counts, index=index).astype("Int64")
669+
602670
@classmethod
603671
def _concat_same_type(
604672
cls: type[ArrowExtensionArrayT], to_concat

pandas/core/arrays/string_arrow.py

+6-73
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from collections.abc import Callable # noqa: PDF001
44
import re
55
from typing import (
6-
TYPE_CHECKING,
76
Any,
87
Union,
98
overload,
@@ -28,7 +27,6 @@
2827
from pandas.compat import (
2928
pa_version_under1p01,
3029
pa_version_under2p0,
31-
pa_version_under3p0,
3230
pa_version_under4p0,
3331
)
3432
from pandas.util._decorators import doc
@@ -77,9 +75,6 @@
7775
}
7876

7977

80-
if TYPE_CHECKING:
81-
from pandas import Series
82-
8378
ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
8479

8580

@@ -140,6 +135,8 @@ class ArrowStringArray(
140135
Length: 4, dtype: string
141136
"""
142137

138+
_pa_dtype = pa.string()
139+
143140
def __init__(self, values) -> None:
144141
self._dtype = StringDtype(storage="pyarrow")
145142
if isinstance(values, pa.Array):
@@ -170,11 +167,11 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
170167
na_values = scalars._mask
171168
result = scalars._data
172169
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
173-
return cls(pa.array(result, mask=na_values, type=pa.string()))
170+
return cls(pa.array(result, mask=na_values, type=cls._pa_dtype))
174171

175172
# convert non-na-likes to str
176173
result = lib.ensure_string_array(scalars, copy=copy)
177-
return cls(pa.array(result, type=pa.string(), from_pandas=True))
174+
return cls(pa.array(result, type=cls._pa_dtype, from_pandas=True))
178175

179176
@classmethod
180177
def _from_sequence_of_strings(
@@ -269,7 +266,7 @@ def __getitem__(
269266

270267
if isinstance(item, np.ndarray):
271268
if not len(item):
272-
return type(self)(pa.chunked_array([], type=pa.string()))
269+
return type(self)(pa.chunked_array([], type=self._pa_dtype))
273270
elif is_integer_dtype(item.dtype):
274271
return self.take(item)
275272
elif is_bool_dtype(item.dtype):
@@ -455,70 +452,6 @@ def take(
455452
indices_array[indices_array < 0] += len(self._data)
456453
return type(self)(self._data.take(indices_array))
457454

458-
def isin(self, values):
459-
if pa_version_under2p0:
460-
return super().isin(values)
461-
462-
value_set = [
463-
pa_scalar.as_py()
464-
for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
465-
if pa_scalar.type in (pa.string(), pa.null())
466-
]
467-
468-
# for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
469-
# for null values, so we short-circuit to return all False array.
470-
if not len(value_set):
471-
return np.zeros(len(self), dtype=bool)
472-
473-
kwargs = {}
474-
if pa_version_under3p0:
475-
# in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
476-
# with unexpected keyword argument in pyarrow 3.0.0+
477-
kwargs["skip_null"] = True
478-
479-
result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
480-
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
481-
# to False
482-
return np.array(result, dtype=np.bool_)
483-
484-
def value_counts(self, dropna: bool = True) -> Series:
485-
"""
486-
Return a Series containing counts of each unique value.
487-
488-
Parameters
489-
----------
490-
dropna : bool, default True
491-
Don't include counts of missing values.
492-
493-
Returns
494-
-------
495-
counts : Series
496-
497-
See Also
498-
--------
499-
Series.value_counts
500-
"""
501-
from pandas import (
502-
Index,
503-
Series,
504-
)
505-
506-
vc = self._data.value_counts()
507-
508-
values = vc.field(0)
509-
counts = vc.field(1)
510-
if dropna and self._data.null_count > 0:
511-
mask = values.is_valid()
512-
values = values.filter(mask)
513-
counts = counts.filter(mask)
514-
515-
# No missing values so we can adhere to the interface and return a numpy array.
516-
counts = np.array(counts)
517-
518-
index = Index(type(self)(values))
519-
520-
return Series(counts, index=index).astype("Int64")
521-
522455
def astype(self, dtype, copy: bool = True):
523456
dtype = pandas_dtype(dtype)
524457

@@ -590,7 +523,7 @@ def _str_map(
590523
result = lib.map_infer_mask(
591524
arr, f, mask.view("uint8"), convert=False, na_value=na_value
592525
)
593-
result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
526+
result = pa.array(result, mask=mask, type=self._pa_dtype, from_pandas=True)
594527
return type(self)(result)
595528
else:
596529
# This is when the result type is object. We reach this when

0 commit comments

Comments
 (0)