
Commit 07d8ee3

simonjayhawkins and yeshsurya authored and committed
ENH: [ArrowStringArray] Enable the string methods for the arrow-backed StringArray (pandas-dev#40708)
1 parent 12c9eff commit 07d8ee3
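As a quick, hedged illustration of what this commit is about (not part of the commit itself), the sketch below calls vectorized string methods on a Series backed by the arrow string dtype. It assumes pyarrow is installed and uses the "arrow_string" dtype name that this commit's tests use; newer pandas versions spell the dtype differently.

import pandas as pd

# Assumption: pyarrow is installed and the "arrow_string" extension dtype is
# registered, as exercised by the tests in this commit.
s = pd.Series(["ab", None, "Cd"], dtype="arrow_string")

# The .str accessor now dispatches through the arrow-backed array.
print(s.str.capitalize())  # string-dtype result, missing value preserved
print(s.str.isalpha())     # nullable boolean result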

File tree

3 files changed (+67 −221 lines)


pandas/core/arrays/string_arrow.py

+23 −179
@@ -1,14 +1,12 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
-import re
 from typing import (
     TYPE_CHECKING,
     Any,
     Sequence,
     cast,
 )
-import warnings
 
 import numpy as np
 
@@ -27,21 +25,22 @@
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import (
-    is_array_like,
-    is_bool_dtype,
-    is_integer,
-    is_integer_dtype,
     is_object_dtype,
-    is_scalar,
     is_string_dtype,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna
 
+from pandas.api.types import (
+    is_array_like,
+    is_bool_dtype,
+    is_integer,
+    is_integer_dtype,
+    is_scalar,
+)
 from pandas.core import missing
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
-from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.indexers import (
     check_array_indexer,
     validate_indices,
@@ -230,21 +229,10 @@ def _chk_pyarrow_available(cls) -> None:
 
     @classmethod
     def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
-        from pandas.core.arrays.masked import BaseMaskedArray
-
         cls._chk_pyarrow_available()
-
-        if isinstance(scalars, BaseMaskedArray):
-            # avoid costly conversion to object dtype in ensure_string_array and
-            # numerical issues with Float32Dtype
-            na_values = scalars._mask
-            result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
-            return cls(pa.array(result, mask=na_values, type=pa.string()))
-
-        # convert non-na-likes to str
-        result = lib.ensure_string_array(scalars, copy=copy)
-        return cls(pa.array(result, type=pa.string(), from_pandas=True))
+        # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value
+        scalars = lib.ensure_string_array(scalars, copy=False)
+        return cls(pa.array(scalars, type=pa.string(), from_pandas=True))
 
     @classmethod
     def _from_sequence_of_strings(
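For context on the simplified _from_sequence above: lib.ensure_string_array is a pandas internal that coerces non-NA values to str, and pa.array(..., from_pandas=True) then turns pandas NA sentinels into Arrow nulls. A minimal sketch of the pyarrow side only, assuming nothing beyond pyarrow and numpy being installed:

import numpy as np
import pyarrow as pa

# from_pandas=True makes pyarrow treat None/NaN as nulls while building
# the string array, which is what the simplified _from_sequence relies on.
arr = pa.array(np.array(["a", None, "b"], dtype=object), type=pa.string(), from_pandas=True)
print(arr.null_count)   # 1
print(arr.to_pylist())  # ['a', None, 'b']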
@@ -433,8 +421,10 @@ def fillna(self, value=None, method=None, limit=None):
         if mask.any():
             if method is not None:
                 func = missing.get_fill_func(method)
+                # error: Argument 1 to "to_numpy" of "ArrowStringArray" has incompatible
+                # type "Type[object]"; expected "Union[str, dtype[Any], None]"
                 new_values, _ = func(
-                    self.to_numpy("object"),
+                    self.to_numpy(object),  # type: ignore[arg-type]
                     limit=limit,
                     mask=mask,
                 )
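A small usage sketch of the fillna path annotated above (not from the commit; it assumes pyarrow is installed and the "arrow_string" dtype name used by the tests in this commit). Method-based filling converts to an object ndarray, fills it, and rebuilds the arrow-backed array:

import pandas as pd

s = pd.Series(["a", None, "c"], dtype="arrow_string")
# method="pad" goes through missing.get_fill_func and the to_numpy(object)
# call shown above before the result is wrapped back into the array type.
print(s.fillna(method="pad"))  # a, a, c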
@@ -687,18 +677,13 @@ def value_counts(self, dropna: bool = True) -> Series:
 
         vc = self._data.value_counts()
 
-        values = vc.field(0)
-        counts = vc.field(1)
-        if dropna and self._data.null_count > 0:
-            mask = values.is_valid()
-            values = values.filter(mask)
-            counts = counts.filter(mask)
-
+        # Index cannot hold ExtensionArrays yet
+        index = Index(type(self)(vc.field(0)).astype(object))
         # No missing values so we can adhere to the interface and return a numpy array.
-        counts = np.array(counts)
+        counts = np.array(vc.field(1))
 
-        # Index cannot hold ExtensionArrays yet
-        index = Index(type(self)(values)).astype(object)
+        if dropna and self._data.null_count > 0:
+            raise NotImplementedError("yo")
 
         return Series(counts, index=index).astype("Int64")
 
@@ -751,7 +736,11 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
             if not na_value_is_na:
                 mask[:] = False
 
-            return constructor(result, mask)
+            # error: Argument 1 to "IntegerArray" has incompatible type
+            # "Union[ExtensionArray, ndarray]"; expected "ndarray"
+            # error: Argument 1 to "BooleanArray" has incompatible type
+            # "Union[ExtensionArray, ndarray]"; expected "ndarray"
+            return constructor(result, mask)  # type: ignore[arg-type]
 
         elif is_string_dtype(dtype) and not is_object_dtype(dtype):
             # i.e. StringDtype
@@ -765,148 +754,3 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
             # or .findall returns a list).
             # -> We don't know the result type. E.g. `.get` can return anything.
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
-
-    def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
-        if flags:
-            return super()._str_contains(pat, case, flags, na, regex)
-
-        if regex:
-            # match_substring_regex added in pyarrow 4.0.0
-            if hasattr(pc, "match_substring_regex") and case:
-                if re.compile(pat).groups:
-                    warnings.warn(
-                        "This pattern has match groups. To actually get the "
-                        "groups, use str.extract.",
-                        UserWarning,
-                        stacklevel=3,
-                    )
-                result = pc.match_substring_regex(self._data, pat)
-            else:
-                return super()._str_contains(pat, case, flags, na, regex)
-        else:
-            if case:
-                result = pc.match_substring(self._data, pat)
-            else:
-                result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
-        result = BooleanDtype().__from_arrow__(result)
-        if not isna(na):
-            result[isna(result)] = bool(na)
-        return result
-
-    def _str_startswith(self, pat, na=None):
-        # match_substring_regex added in pyarrow 4.0.0
-        if hasattr(pc, "match_substring_regex"):
-            result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
-            result = BooleanDtype().__from_arrow__(result)
-            if not isna(na):
-                result[isna(result)] = bool(na)
-            return result
-        else:
-            return super()._str_startswith(pat, na)
-
-    def _str_endswith(self, pat, na=None):
-        # match_substring_regex added in pyarrow 4.0.0
-        if hasattr(pc, "match_substring_regex"):
-            result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
-            result = BooleanDtype().__from_arrow__(result)
-            if not isna(na):
-                result[isna(result)] = bool(na)
-            return result
-        else:
-            return super()._str_endswith(pat, na)
-
-    def _str_isalnum(self):
-        if hasattr(pc, "utf8_is_alnum"):
-            result = pc.utf8_is_alnum(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_isalnum()
-
-    def _str_isalpha(self):
-        if hasattr(pc, "utf8_is_alpha"):
-            result = pc.utf8_is_alpha(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_isalpha()
-
-    def _str_isdecimal(self):
-        if hasattr(pc, "utf8_is_decimal"):
-            result = pc.utf8_is_decimal(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_isdecimal()
-
-    def _str_isdigit(self):
-        if hasattr(pc, "utf8_is_digit"):
-            result = pc.utf8_is_digit(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_isdigit()
-
-    def _str_islower(self):
-        if hasattr(pc, "utf8_is_lower"):
-            result = pc.utf8_is_lower(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_islower()
-
-    def _str_isnumeric(self):
-        if hasattr(pc, "utf8_is_numeric"):
-            result = pc.utf8_is_numeric(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_isnumeric()
-
-    def _str_isspace(self):
-        if hasattr(pc, "utf8_is_space"):
-            result = pc.utf8_is_space(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_isspace()
-
-    def _str_istitle(self):
-        if hasattr(pc, "utf8_is_title"):
-            result = pc.utf8_is_title(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_istitle()
-
-    def _str_isupper(self):
-        if hasattr(pc, "utf8_is_upper"):
-            result = pc.utf8_is_upper(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
-            return super()._str_isupper()
-
-    def _str_lower(self):
-        return type(self)(pc.utf8_lower(self._data))
-
-    def _str_upper(self):
-        return type(self)(pc.utf8_upper(self._data))
-
-    def _str_strip(self, to_strip=None):
-        if to_strip is None:
-            if hasattr(pc, "utf8_trim_whitespace"):
-                return type(self)(pc.utf8_trim_whitespace(self._data))
-        else:
-            if hasattr(pc, "utf8_trim"):
-                return type(self)(pc.utf8_trim(self._data, characters=to_strip))
-        return super()._str_strip(to_strip)
-
-    def _str_lstrip(self, to_strip=None):
-        if to_strip is None:
-            if hasattr(pc, "utf8_ltrim_whitespace"):
-                return type(self)(pc.utf8_ltrim_whitespace(self._data))
-        else:
-            if hasattr(pc, "utf8_ltrim"):
-                return type(self)(pc.utf8_ltrim(self._data, characters=to_strip))
-        return super()._str_lstrip(to_strip)
-
-    def _str_rstrip(self, to_strip=None):
-        if to_strip is None:
-            if hasattr(pc, "utf8_rtrim_whitespace"):
-                return type(self)(pc.utf8_rtrim_whitespace(self._data))
-        else:
-            if hasattr(pc, "utf8_rtrim"):
-                return type(self)(pc.utf8_rtrim(self._data, characters=to_strip))
-        return super()._str_rstrip(to_strip)
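The removed _str_* methods above all follow one pattern: feature-detect the pyarrow compute kernel (older pyarrow releases lack some of them), apply it to the underlying ChunkedArray, and wrap boolean results back into a nullable pandas array. A hedged sketch of that pattern using only public pyarrow, not the commit's code:

import pyarrow as pa
import pyarrow.compute as pc

data = pa.chunked_array([pa.array(["abc", "ab1", None], type=pa.string())])

# Feature-detect the kernel before using it, mirroring the hasattr checks above.
if hasattr(pc, "utf8_is_alnum"):
    result = pc.utf8_is_alnum(data)
    print(result.to_pylist())  # [True, True, None]
else:
    print("pyarrow too old; a Python-level fallback would be used instead")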

pandas/tests/arrays/string_/test_string.py

+29 −21
@@ -1,7 +1,4 @@
-"""
-This module tests the functionality of StringArray and ArrowStringArray.
-Tests for the str accessors are in pandas/tests/strings/test_string_array.py
-"""
+import operator
 
 import numpy as np
 import pytest
@@ -91,6 +88,23 @@ def test_setitem_with_scalar_string(dtype):
     tm.assert_extension_array_equal(arr, expected)
 
 
+@pytest.mark.parametrize(
+    "input, method",
+    [
+        (["a", "b", "c"], operator.methodcaller("capitalize")),
+        (["a b", "a bc. de"], operator.methodcaller("capitalize")),
+    ],
+)
+def test_string_methods(input, method, dtype):
+    a = pd.Series(input, dtype=dtype)
+    b = pd.Series(input, dtype="object")
+    result = method(a.str)
+    expected = method(b.str)
+
+    assert result.dtype.name == dtype
+    tm.assert_series_equal(result.astype(object), expected)
+
+
 def test_astype_roundtrip(dtype, request):
     if dtype == "arrow_string":
         reason = "ValueError: Could not convert object to NumPy datetime"
@@ -476,23 +490,12 @@ def test_arrow_roundtrip(dtype, dtype_object):
     assert result.loc[2, "a"] is pd.NA
 
 
-@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
-def test_arrow_load_from_zero_chunks(dtype, dtype_object):
-    # GH-41040
-    import pyarrow as pa
-
-    data = pd.array([], dtype=dtype)
-    df = pd.DataFrame({"a": data})
-    table = pa.table(df)
-    assert table.field("a").type == "string"
-    # Instantiate the same table with no chunks at all
-    table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
-    result = table.to_pandas()
-    assert isinstance(result["a"].dtype, dtype_object)
-    tm.assert_frame_equal(result, df)
-
+def test_value_counts_na(dtype, request):
+    if dtype == "arrow_string":
+        reason = "TypeError: boolean value of NA is ambiguous"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
 
-def test_value_counts_na(dtype):
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
     result = arr.value_counts(dropna=False)
     expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
@@ -503,7 +506,12 @@ def test_value_counts_na(dtype):
     tm.assert_series_equal(result, expected)
 
 
-def test_value_counts_with_normalize(dtype):
+def test_value_counts_with_normalize(dtype, request):
+    if dtype == "arrow_string":
+        reason = "TypeError: boolean value of NA is ambiguous"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
     s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
     result = s.value_counts(normalize=True)
     expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
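The xfail pattern used in the two value_counts tests above marks a test as expected to fail for one parametrization only, at runtime. A generic sketch of the same idiom (a hypothetical test, not taken from the commit):

import pytest


@pytest.mark.parametrize("backend", ["python", "arrow_string"])
def test_example(backend, request):
    if backend == "arrow_string":
        # Known failure for this backend only; the rest of the matrix still runs.
        request.node.add_marker(pytest.mark.xfail(reason="not implemented yet"))
    assert backend == "python"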
