Backport PR pandas-dev#31159: ENH: Implement _from_sequence_of_strings for BooleanArray (pandas-dev#31261)

meeseeksmachine · WillAyd · commit 29edc79a7f67 · 2020-01-23T13:31:19.000-08:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -144,7 +144,7 @@ type dedicated to boolean data that can hold missing values. The default
 ``bool`` data type based on a bool-dtype NumPy array, the column can only hold
 ``True`` or ``False``, and not missing values. This new :class:`~arrays.BooleanArray`
 can store missing values as well by keeping track of this in a separate mask.
-(:issue:`29555`, :issue:`30095`)
+(:issue:`29555`, :issue:`30095`, :issue:`31131`)
 
 .. ipython:: python
 
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -1,5 +1,5 @@
 import numbers
-from typing import TYPE_CHECKING, Any, Tuple, Type
+from typing import TYPE_CHECKING, Any, List, Tuple, Type
 import warnings
 
 import numpy as np
@@ -286,6 +286,23 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
         values, mask = coerce_to_array(scalars, copy=copy)
         return BooleanArray(values, mask)
 
+    @classmethod
+    def _from_sequence_of_strings(
+        cls, strings: List[str], dtype=None, copy: bool = False
+    ):
+        def map_string(s):  # map one element; missing values pass through
+            if isna(s):  # missing value: returned unchanged
+                return s
+            elif s in ["True", "TRUE", "true"]:  # exact spellings only
+                return True
+            elif s in ["False", "FALSE", "false"]:  # exact spellings only
+                return False
+            else:  # anything else is rejected
+                raise ValueError(f"{s} cannot be cast to bool")
+
+        scalars = [map_string(x) for x in strings]  # ValueError propagates
+        return cls._from_sequence(scalars, dtype, copy)  # delegate construction
+
     def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
         data = self._data.astype("int8")
         data[self._mask] = -1
diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -251,6 +251,22 @@ def test_coerce_to_numpy_array():
         np.array(arr, dtype="bool")
 
 
+def test_to_boolean_array_from_strings():
+    result = BooleanArray._from_sequence_of_strings(
+        np.array(["True", "False", np.nan], dtype=object)  # NaN = missing
+    )
+    expected = BooleanArray(
+        np.array([True, False, False]), np.array([False, False, True])  # values, mask
+    )
+
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_to_boolean_array_from_strings_invalid_string():
+    with pytest.raises(ValueError, match="cannot be cast"):  # message fragment
+        BooleanArray._from_sequence_of_strings(["donkey"])  # unrecognized string
+
+
 def test_repr():
     df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")})
     expected = "       A\n0   True\n1  False\n2   <NA>"
diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
@@ -550,3 +550,35 @@ def test_numeric_dtype(all_parsers, dtype):
 
     result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
     tm.assert_frame_equal(expected, result)
+
+
+def test_boolean_dtype(all_parsers):  # read_csv with dtype="boolean"
+    parser = all_parsers
+    data = "\n".join(
+        [
+            "a",  # header row; the expected frame has a single column "a"
+            "True",
+            "TRUE",
+            "true",
+            "False",
+            "FALSE",
+            "false",
+            "NaN",  # this and the four entries below are expected as missing
+            "nan",
+            "NA",
+            "null",
+            "NULL",
+        ]
+    )
+
+    result = parser.read_csv(StringIO(data), dtype="boolean")
+    expected = pd.DataFrame(
+        {
+            "a": pd.array(
+                [True, True, True, False, False, False, None, None, None, None, None],
+                dtype="boolean",
+            )
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)