Skip to content

Commit e7ac30d

Browse files
authored
BUG: read_csv raising ValueError for true_values/false_values and boolean dtype (#39012)
1 parent eead40e commit e7ac30d

File tree

5 files changed

+55
-6
lines changed

5 files changed

+55
-6
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ I/O
284284
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
285285
- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
286286
- :func:`pandas.read_excel` now respects :func:`pandas.set_option` (:issue:`34252`)
287+
- Bug in :func:`read_csv` not applying ``true_values`` and ``false_values`` when converting to the nullable ``boolean`` dtype (:issue:`34655`)
287288
- Bug in :func:`read_json` where ``orient="split"`` does not maintain a numeric string index (:issue:`28556`)
288289

289290
Period

pandas/_libs/parsers.pyx

+9-2
Original file line numberDiff line numberDiff line change
@@ -1084,11 +1084,18 @@ cdef class TextReader:
10841084
elif is_extension_array_dtype(dtype):
10851085
result, na_count = self._string_convert(i, start, end, na_filter,
10861086
na_hashset)
1087+
10871088
array_type = dtype.construct_array_type()
10881089
try:
10891090
# use _from_sequence_of_strings if the class defines it
1090-
result = array_type._from_sequence_of_strings(result,
1091-
dtype=dtype)
1091+
if is_bool_dtype(dtype):
1092+
true_values = [x.decode() for x in self.true_values]
1093+
false_values = [x.decode() for x in self.false_values]
1094+
result = array_type._from_sequence_of_strings(
1095+
result, dtype=dtype, true_values=true_values,
1096+
false_values=false_values)
1097+
else:
1098+
result = array_type._from_sequence_of_strings(result, dtype=dtype)
10921099
except NotImplementedError:
10931100
raise NotImplementedError(
10941101
f"Extension Array: {array_type} must implement "

pandas/core/arrays/boolean.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,8 @@ class BooleanArray(BaseMaskedArray):
257257

258258
# The value used to fill '_data' to avoid upcasting
259259
_internal_fill_value = False
260+
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
261+
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
260262

261263
def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
262264
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
@@ -282,14 +284,23 @@ def _from_sequence(
282284

283285
@classmethod
284286
def _from_sequence_of_strings(
285-
cls, strings: List[str], *, dtype: Optional[Dtype] = None, copy: bool = False
287+
cls,
288+
strings: List[str],
289+
*,
290+
dtype: Optional[Dtype] = None,
291+
copy: bool = False,
292+
true_values: Optional[List[str]] = None,
293+
false_values: Optional[List[str]] = None,
286294
) -> "BooleanArray":
295+
true_values_union = cls._TRUE_VALUES.union(true_values or [])
296+
false_values_union = cls._FALSE_VALUES.union(false_values or [])
297+
287298
def map_string(s):
288299
if isna(s):
289300
return s
290-
elif s in ["True", "TRUE", "true", "1", "1.0"]:
301+
elif s in true_values_union:
291302
return True
292-
elif s in ["False", "FALSE", "false", "0", "0.0"]:
303+
elif s in false_values_union:
293304
return False
294305
else:
295306
raise ValueError(f"{s} cannot be cast to bool")

pandas/io/parsers.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1812,7 +1812,15 @@ def _cast_types(self, values, cast_type, column):
18121812
cast_type = pandas_dtype(cast_type)
18131813
array_type = cast_type.construct_array_type()
18141814
try:
1815-
return array_type._from_sequence_of_strings(values, dtype=cast_type)
1815+
if is_bool_dtype(cast_type):
1816+
return array_type._from_sequence_of_strings(
1817+
values,
1818+
dtype=cast_type,
1819+
true_values=self.true_values,
1820+
false_values=self.false_values,
1821+
)
1822+
else:
1823+
return array_type._from_sequence_of_strings(values, dtype=cast_type)
18161824
except NotImplementedError as err:
18171825
raise NotImplementedError(
18181826
f"Extension Array: {array_type} must implement "

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+22
Original file line numberDiff line numberDiff line change
@@ -213,3 +213,25 @@ def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
213213
)
214214
val = df.iloc[0, 0]
215215
assert val == numeric_decimal[1]
216+
217+
218+
def test_true_values_cast_to_bool(all_parsers):
219+
# GH#34655
220+
text = """a,b
221+
yes,xxx
222+
no,yyy
223+
1,zzz
224+
0,aaa
225+
"""
226+
parser = all_parsers
227+
result = parser.read_csv(
228+
StringIO(text),
229+
true_values=["yes"],
230+
false_values=["no"],
231+
dtype={"a": "boolean"},
232+
)
233+
expected = DataFrame(
234+
{"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
235+
)
236+
expected["a"] = expected["a"].astype("boolean")
237+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)