Skip to content

GH20591 read_csv raise ValueError for bool columns with missing values (C engine) #23968

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 2, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ Backwards incompatible API changes
- :meth:`Series.str.cat` will now raise if ``others`` is a ``set`` (:issue:`23009`)
- Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`)
- ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`)
- :func:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`)

.. _whatsnew_0240.api_breaking.deps:

Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1245,6 +1245,10 @@ cdef class TextReader:
result, na_count = _try_bool_flex(self.parser, i, start, end,
na_filter, na_hashset,
self.true_set, self.false_set)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError("Bool column has NA values in "
"column {column}".format(column=i))
return result, na_count

elif dtype.kind == 'S':
Expand Down
16 changes: 13 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@

from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
ensure_object, is_categorical_dtype, is_dtype_equal, is_float, is_integer,
is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
is_string_dtype)
ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype,
is_scalar, is_string_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -1669,6 +1669,16 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,

# type specified in dtype param
if cast_type and not is_dtype_equal(cvals, cast_type):
try:
if (is_bool_dtype(cast_type) and
not is_categorical_dtype(cast_type)
and na_count):
raise ValueError("Bool column has NA values in "
"column {column}"
.format(column=c))
except (AttributeError, TypeError):
# invalid input to is_bool_dtype
pass
cvals = self._cast_types(cvals, cast_type, c)

result[c] = cvals
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/io/parser/test_na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,3 +421,21 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):

result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("data, na_values", [
    ("false,1\n,1\ntrue", None),
    ("false,1\nnull,1\ntrue", None),
    ("false,1\nnan,1\ntrue", None),
    ("false,1\nfoo,1\ntrue", 'foo'),
    ("false,1\nfoo,1\ntrue", ['foo']),
    ("false,1\nfoo,1\ntrue", {'a': 'foo'}),
])
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    """Check that reading an NA-containing column as ``bool`` raises.

    Each parametrized case supplies CSV text whose first column (named
    ``a`` via ``names``) is requested as dtype ``'bool'`` and whose second
    row carries a value the test expects to be treated as missing, either
    without any ``na_values`` or through the given scalar, list or
    per-column dict.  ``read_csv`` must raise ``ValueError``.
    """
    # Two different error messages are accepted.  The first may identify
    # the column either by position ("0") or by name ("a").
    bool_na_pattern = "(Bool column has NA values in column [0a])"
    unsafe_cast_pattern = ("(cannot safely convert passed user dtype of "
                           "bool for object dtyped data in column 0)")
    expected_error = "|".join([bool_na_pattern, unsafe_cast_pattern])

    csv_buffer = StringIO(data)
    with pytest.raises(ValueError, match=expected_error):
        all_parsers.read_csv(csv_buffer, header=None, names=['a', 'b'],
                             dtype={'a': 'bool'}, na_values=na_values)