diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 99918aef7fd08..8c43b53f5cdfd 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -144,7 +144,7 @@ type dedicated to boolean data that can hold missing values. The default ``bool`` data type based on a bool-dtype NumPy array, the column can only hold ``True`` or ``False``, and not missing values. This new :class:`~arrays.BooleanArray` can store missing values as well by keeping track of this in a separate mask. -(:issue:`29555`, :issue:`30095`) +(:issue:`29555`, :issue:`30095`, :issue:`31131`) .. ipython:: python diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index eaa17df1235d3..7b12f3348e7e7 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -1,5 +1,5 @@ import numbers -from typing import TYPE_CHECKING, Any, Tuple, Type +from typing import TYPE_CHECKING, Any, List, Tuple, Type import warnings import numpy as np @@ -286,6 +286,23 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False): values, mask = coerce_to_array(scalars, copy=copy) return BooleanArray(values, mask) + @classmethod + def _from_sequence_of_strings( + cls, strings: List[str], dtype=None, copy: bool = False + ): + def map_string(s): + if isna(s): + return s + elif s in ["True", "TRUE", "true"]: + return True + elif s in ["False", "FALSE", "false"]: + return False + else: + raise ValueError(f"{s} cannot be cast to bool") + + scalars = [map_string(x) for x in strings] + return cls._from_sequence(scalars, dtype, copy) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: data = self._data.astype("int8") data[self._mask] = -1 diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 6e361b2810d54..200446f79af8a 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -251,6 +251,22 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") +def test_to_boolean_array_from_strings(): + result = BooleanArray._from_sequence_of_strings( + np.array(["True", "False", np.nan], dtype=object) + ) + expected = BooleanArray( + np.array([True, False, False]), np.array([False, False, True]) + ) + + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_strings_invalid_string(): + with pytest.raises(ValueError, match="cannot be cast"): + BooleanArray._from_sequence_of_strings(["donkey"]) + + def test_repr(): df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) expected = " A\n0 True\n1 False\n2 " diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index d08c86bf2ae75..11dcf7f04f76b 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -550,3 +550,35 @@ def test_numeric_dtype(all_parsers, dtype): result = parser.read_csv(StringIO(data), header=None, dtype=dtype) tm.assert_frame_equal(expected, result) + + +def test_boolean_dtype(all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "False", + "FALSE", + "false", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = pd.DataFrame( + { + "a": pd.array( + [True, True, True, False, False, False, None, None, None, None, None], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected)