Skip to content

Commit 29edc79

Browse files
meeseeksmachine and WillAyd
authored and committed
Backport PR pandas-dev#31159: ENH: Implement _from_sequence_of_strings for BooleanArray (pandas-dev#31261)
1 parent 01f9742 commit 29edc79

File tree

4 files changed

+67
-2
lines changed

4 files changed

+67
-2
lines changed

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ type dedicated to boolean data that can hold missing values. The default
144144
``bool`` data type based on a bool-dtype NumPy array, the column can only hold
145145
``True`` or ``False``, and not missing values. This new :class:`~arrays.BooleanArray`
146146
can store missing values as well by keeping track of this in a separate mask.
147-
(:issue:`29555`, :issue:`30095`)
147+
(:issue:`29555`, :issue:`30095`, :issue:`31131`)
148148

149149
.. ipython:: python
150150

pandas/core/arrays/boolean.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import numbers
2-
from typing import TYPE_CHECKING, Any, Tuple, Type
2+
from typing import TYPE_CHECKING, Any, List, Tuple, Type
33
import warnings
44

55
import numpy as np
@@ -286,6 +286,23 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
286286
values, mask = coerce_to_array(scalars, copy=copy)
287287
return BooleanArray(values, mask)
288288

289+
@classmethod
def _from_sequence_of_strings(
    cls, strings: List[str], dtype=None, copy: bool = False
):
    """
    Construct a new array from a sequence of boolean-like strings.

    Parameters
    ----------
    strings : sequence of str
        Each element must be one of "True", "TRUE", "true", "False",
        "FALSE", "false", or a missing value as detected by ``isna``.
    dtype : optional
        Forwarded unchanged to ``cls._from_sequence``.
    copy : bool, default False
        Forwarded unchanged to ``cls._from_sequence``.

    Returns
    -------
    The result of ``cls._from_sequence`` on the converted scalars.

    Raises
    ------
    ValueError
        If an element is neither missing nor one of the accepted spellings.
    """
    truthy = ("True", "TRUE", "true")
    falsy = ("False", "FALSE", "false")

    parsed = []
    for item in strings:
        if isna(item):
            # Missing values are handed on untouched; _from_sequence
            # receives them as-is.
            parsed.append(item)
        elif item in truthy:
            parsed.append(True)
        elif item in falsy:
            parsed.append(False)
        else:
            raise ValueError(f"{item} cannot be cast to bool")

    return cls._from_sequence(parsed, dtype, copy)
305+
289306
def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
290307
data = self._data.astype("int8")
291308
data[self._mask] = -1

pandas/tests/arrays/test_boolean.py

+16
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,22 @@ def test_coerce_to_numpy_array():
251251
np.array(arr, dtype="bool")
252252

253253

254+
def test_to_boolean_array_from_strings():
    """Check that "True"/"False"/NaN inputs produce the expected values and mask."""
    raw = np.array(["True", "False", np.nan], dtype=object)
    expected_values = np.array([True, False, False])
    expected_mask = np.array([False, False, True])

    result = BooleanArray._from_sequence_of_strings(raw)
    expected = BooleanArray(expected_values, expected_mask)

    tm.assert_extension_array_equal(result, expected)
263+
264+
265+
def test_to_boolean_array_from_strings_invalid_string():
    """Check that an unrecognised string raises ValueError."""
    bad_input = ["donkey"]
    with pytest.raises(ValueError, match="cannot be cast"):
        BooleanArray._from_sequence_of_strings(bad_input)
268+
269+
254270
def test_repr():
255271
df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")})
256272
expected = " A\n0 True\n1 False\n2 <NA>"

pandas/tests/io/parser/test_dtypes.py

+32
Original file line numberDiff line numberDiff line change
@@ -550,3 +550,35 @@ def test_numeric_dtype(all_parsers, dtype):
550550

551551
result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
552552
tm.assert_frame_equal(expected, result)
553+
554+
555+
def test_boolean_dtype(all_parsers):
    """Check that read_csv with dtype="boolean" parses boolean and NA spellings."""
    true_tokens = ["True", "TRUE", "true"]
    false_tokens = ["False", "FALSE", "false"]
    na_tokens = ["NaN", "nan", "NA", "null", "NULL"]

    # Single-column CSV: header "a" followed by one token per row.
    data = "\n".join(["a", *true_tokens, *false_tokens, *na_tokens])

    values = (
        [True] * len(true_tokens)
        + [False] * len(false_tokens)
        + [None] * len(na_tokens)
    )
    expected = pd.DataFrame({"a": pd.array(values, dtype="boolean")})

    result = all_parsers.read_csv(StringIO(data), dtype="boolean")

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)