diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c74bcb505b6be..62f036f4fcf0f 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -330,6 +330,7 @@ Backwards incompatible API changes - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) +- :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) .. _whatsnew_0240.api_breaking.deps: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1dc71264c94dd..a459057555cf3 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1245,6 +1245,10 @@ cdef class TextReader: result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) + if user_dtype and na_count is not None: + if na_count > 0: + raise ValueError("Bool column has NA values in " + "column {column}".format(column=i)) return result, na_count elif dtype.kind == 'S': diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7ade35f93a858..aadca1fcb3bef 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -27,9 +27,9 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - ensure_object, is_categorical_dtype, is_dtype_equal, is_float, is_integer, - is_integer_dtype, is_list_like, is_object_dtype, is_scalar, - is_string_dtype) + ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal, + is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype, + is_scalar, is_string_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -1669,6 +1669,16 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, # type specified in dtype param if cast_type and not is_dtype_equal(cvals, cast_type): + try: + if (is_bool_dtype(cast_type) and + not is_categorical_dtype(cast_type) + and na_count > 0): + raise ValueError("Bool column has NA values in " + "column {column}" + .format(column=c)) + except (AttributeError, TypeError): + # invalid input to is_bool_dtype + pass cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 921984bc44e50..1b6d2ee8a062e 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -421,3 +421,21 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data, na_values", [ + ("false,1\n,1\ntrue", None), + ("false,1\nnull,1\ntrue", None), + ("false,1\nnan,1\ntrue", None), + ("false,1\nfoo,1\ntrue", 'foo'), + ("false,1\nfoo,1\ntrue", ['foo']), + ("false,1\nfoo,1\ntrue", {'a': 'foo'}), +]) +def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): + parser = all_parsers + msg = ("(Bool column has NA values in column [0a])|" + "(cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0)") + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, names=['a', 'b'], + dtype={'a': 'bool'}, na_values=na_values)