Skip to content

BUG: read_csv raising ValueError for true_values/false_values and boolean dtype #39012

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 9, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ I/O
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
- :func:`pandas.read_excel` now respects :func:`pandas.set_option` (:issue:`34252`)
- Bug in :func:`read_csv` not applying ``true_values`` and ``false_values`` to columns with the nullable ``boolean`` dtype (:issue:`34655`)
- Bug in :func:`read_json` when ``orient="split"`` not maintaining a numeric string index (:issue:`28556`)

Period
Expand Down
11 changes: 9 additions & 2 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1084,11 +1084,18 @@ cdef class TextReader:
elif is_extension_array_dtype(dtype):
result, na_count = self._string_convert(i, start, end, na_filter,
na_hashset)

array_type = dtype.construct_array_type()
try:
# use _from_sequence_of_strings if the class defines it
result = array_type._from_sequence_of_strings(result,
dtype=dtype)
if is_bool_dtype(dtype):
true_values = [x.decode() for x in self.true_values]
false_values = [x.decode() for x in self.false_values]
result = array_type._from_sequence_of_strings(
result, dtype=dtype, true_values=true_values,
false_values=false_values)
else:
result = array_type._from_sequence_of_strings(result, dtype=dtype)
except NotImplementedError:
raise NotImplementedError(
f"Extension Array: {array_type} must implement "
Expand Down
23 changes: 20 additions & 3 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,8 @@ class BooleanArray(BaseMaskedArray):

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These could also be module-level constants (it doesn't matter). Note that the parser itself only has:

    object _true_values = [b'True', b'TRUE', b'true']
    object _false_values = [b'False', b'FALSE', b'false']

But yes, I think we agreed that accepting 1/1.0 is fine.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case let's keep it there

_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}

def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
Expand All @@ -282,14 +284,29 @@ def _from_sequence(

@classmethod
def _from_sequence_of_strings(
cls, strings: List[str], *, dtype: Optional[Dtype] = None, copy: bool = False
cls,
strings: List[str],
*,
dtype: Optional[Dtype] = None,
copy: bool = False,
true_values: Optional[List[str]] = None,
false_values: Optional[List[str]] = None,
) -> "BooleanArray":
if true_values is not None:
true_values_union = cls._TRUE_VALUES.union(true_values)
else:
true_values_union = cls._TRUE_VALUES
if false_values is not None:
false_values_union = cls._FALSE_VALUES.union(false_values)
else:
false_values_union = cls._FALSE_VALUES

def map_string(s):
if isna(s):
return s
elif s in ["True", "TRUE", "true", "1", "1.0"]:
elif s in true_values_union:
return True
elif s in ["False", "FALSE", "false", "0", "0.0"]:
elif s in false_values_union:
return False
else:
raise ValueError(f"{s} cannot be cast to bool")
Expand Down
10 changes: 9 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1812,7 +1812,15 @@ def _cast_types(self, values, cast_type, column):
cast_type = pandas_dtype(cast_type)
array_type = cast_type.construct_array_type()
try:
return array_type._from_sequence_of_strings(values, dtype=cast_type)
if is_bool_dtype(cast_type):
return array_type._from_sequence_of_strings(
values,
dtype=cast_type,
true_values=self.true_values,
false_values=self.false_values,
)
else:
return array_type._from_sequence_of_strings(values, dtype=cast_type)
except NotImplementedError as err:
raise NotImplementedError(
f"Extension Array: {array_type} must implement "
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,25 @@ def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
)
val = df.iloc[0, 0]
assert val == numeric_decimal[1]


def test_true_values_cast_to_bool(all_parsers):
    # GH#34655
    # Column "a" mixes the custom markers ("yes"/"no") with "1"/"0"; every
    # row is expected to parse into the nullable boolean dtype.
    csv_data = """a,b
yes,xxx
no,yyy
1,zzz
0,aaa
"""
    result = all_parsers.read_csv(
        StringIO(csv_data),
        true_values=["yes"],
        false_values=["no"],
        dtype={"a": "boolean"},
    )

    # Build the expected frame with plain bools, then cast only column "a"
    # to the nullable "boolean" extension dtype requested above.
    expected = DataFrame(
        {"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
    ).astype({"a": "boolean"})

    tm.assert_frame_equal(result, expected)