API: Return BooleanArray for string ops when backed by StringArray (#30239)

TomAugspurger · web-flow · commit 5b25df2b578c · 2019-12-19T11:09:20.000-06:00
* API: Return BooleanArray for string ops
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -74,6 +74,7 @@ These are places where the behavior of ``StringDtype`` objects differ from
 l. For ``StringDtype``, :ref:`string accessor methods<api.series.str>`
    that return **numeric** output will always return a nullable integer dtype,
    rather than either int or float dtype, depending on the presence of NA values.
+   Methods returning **boolean** output will return a nullable boolean dtype.
 
    .. ipython:: python
 
@@ -89,7 +90,13 @@ l. For ``StringDtype``, :ref:`string accessor methods<api.series.str>`
       s.astype(object).str.count("a")
       s.astype(object).dropna().str.count("a")
 
-   When NA values are present, the output dtype is float64.
+   When NA values are present, the output dtype is float64. With
+   ``StringDtype``, boolean-returning methods give a nullable boolean dtype.
+
+   .. ipython:: python
+
+      s.str.isdigit()
+      s.str.match("a")
 
 2. Some string methods, like :meth:`Series.str.decode` are not available
    on ``StringArray`` because ``StringArray`` only holds strings, not
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -2,7 +2,7 @@
 from functools import wraps
 import re
 import textwrap
-from typing import TYPE_CHECKING, Any, Callable, Dict, List
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union
 import warnings
 
 import numpy as np
@@ -142,7 +142,7 @@ def _map_stringarray(
         The value to use for missing values. By default, this is
         the original value (NA).
     dtype : Dtype
-        The result dtype to use. Specifying this aviods an intermediate
+        The result dtype to use. Specifying this avoids an intermediate
         object-dtype allocation.
 
     Returns
@@ -152,14 +152,20 @@ def _map_stringarray(
         an ndarray.
 
     """
-    from pandas.arrays import IntegerArray, StringArray
+    from pandas.arrays import IntegerArray, StringArray, BooleanArray
 
     mask = isna(arr)
 
     assert isinstance(arr, StringArray)
     arr = np.asarray(arr)
 
-    if is_integer_dtype(dtype):
+    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+        constructor: Union[Type[IntegerArray], Type[BooleanArray]]
+        if is_integer_dtype(dtype):
+            constructor = IntegerArray
+        else:
+            constructor = BooleanArray
+
         na_value_is_na = isna(na_value)
         if na_value_is_na:
             na_value = 1
@@ -169,21 +175,20 @@ def _map_stringarray(
             mask.view("uint8"),
             convert=False,
             na_value=na_value,
-            dtype=np.dtype("int64"),
+            dtype=np.dtype(dtype),
         )
 
         if not na_value_is_na:
             mask[:] = False
 
-        return IntegerArray(result, mask)
+        return constructor(result, mask)
 
     elif is_string_dtype(dtype) and not is_object_dtype(dtype):
         # i.e. StringDtype
         result = lib.map_infer_mask(
             arr, func, mask.view("uint8"), convert=False, na_value=na_value
         )
         return StringArray(result)
-    # TODO: BooleanArray
     else:
         # This is when the result type is object. We reach this when
         # -> We know the result type is truly object (e.g. .encode returns bytes
@@ -299,7 +304,7 @@ def str_count(arr, pat, flags=0):
     """
     regex = re.compile(pat, flags=flags)
     f = lambda x: len(regex.findall(x))
-    return _na_map(f, arr, dtype=int)
+    return _na_map(f, arr, dtype="int64")
 
 
 def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
@@ -1365,7 +1370,7 @@ def str_find(arr, sub, start=0, end=None, side="left"):
     else:
         f = lambda x: getattr(x, method)(sub, start, end)
 
-    return _na_map(f, arr, dtype=int)
+    return _na_map(f, arr, dtype="int64")
 
 
 def str_index(arr, sub, start=0, end=None, side="left"):
@@ -1385,7 +1390,7 @@ def str_index(arr, sub, start=0, end=None, side="left"):
     else:
         f = lambda x: getattr(x, method)(sub, start, end)
 
-    return _na_map(f, arr, dtype=int)
+    return _na_map(f, arr, dtype="int64")
 
 
 def str_pad(arr, width, side="left", fillchar=" "):
@@ -3210,7 +3215,7 @@ def rindex(self, sub, start=0, end=None):
         len,
         docstring=_shared_docs["len"],
         forbidden_types=None,
-        dtype=int,
+        dtype="int64",
         returns_string=False,
     )
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -1825,7 +1825,7 @@ def test_extractall_same_as_extract_subject_index(self):
 
     def test_empty_str_methods(self):
         empty_str = empty = Series(dtype=object)
-        empty_int = Series(dtype=int)
+        empty_int = Series(dtype="int64")
         empty_bool = Series(dtype=bool)
         empty_bytes = Series(dtype=object)
 
@@ -3526,6 +3526,12 @@ def test_string_array(any_string_method):
             assert result.dtype == "string"
             result = result.astype(object)
 
+        elif expected.dtype == "object" and lib.is_bool_array(
+            expected.values, skipna=True
+        ):
+            assert result.dtype == "boolean"
+            result = result.astype(object)
+
         elif expected.dtype == "float" and expected.isna().any():
             assert result.dtype == "Int64"
             result = result.astype("float")
@@ -3551,3 +3557,19 @@ def test_string_array_numeric_integer_array(method, expected):
     result = getattr(s.str, method)("a")
     expected = Series(expected, dtype="Int64")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,expected",
+    [
+        ("isdigit", [False, None, True]),
+        ("isalpha", [True, None, False]),
+        ("isalnum", [True, None, True]),
+        ("isnumeric", [False, None, True]),
+    ],
+)
+def test_string_array_boolean_array(method, expected):
+    s = Series(["a", None, "1"], dtype="string")  # alpha, missing, digit
+    result = getattr(s.str, method)()
+    expected = Series(expected, dtype="boolean")  # nullable boolean dtype
+    tm.assert_series_equal(result, expected)