GH20591 read_csv raise ValueError for bool columns with missing values (C engine) (pandas-dev#23968)

JustinZhengBC · Pingviinituutti · commit 1444b33f0dce · 2019-02-28T10:26:56.000+02:00
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -376,6 +376,7 @@ Backwards incompatible API changes
 - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`)
 - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`)
 - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`)
+- :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`)
 
 .. _whatsnew_0240.api_breaking.deps:
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1245,6 +1245,10 @@ cdef class TextReader:
             result, na_count = _try_bool_flex(self.parser, i, start, end,
                                               na_filter, na_hashset,
                                               self.true_set, self.false_set)
+            if user_dtype and na_count is not None:
+                if na_count > 0:
+                    raise ValueError("Bool column has NA values in "
+                                     "column {column}".format(column=i))
             return result, na_count
 
         elif dtype.kind == 'S':
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -27,9 +27,9 @@
 
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.common import (
-    ensure_object, is_categorical_dtype, is_dtype_equal, is_float, is_integer,
-    is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
-    is_string_dtype)
+    ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
+    is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype,
+    is_scalar, is_string_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 
@@ -1669,6 +1669,16 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
 
                 # type specified in dtype param
                 if cast_type and not is_dtype_equal(cvals, cast_type):
+                    try:
+                        if (is_bool_dtype(cast_type) and
+                                not is_categorical_dtype(cast_type)
+                                and na_count > 0):
+                            raise ValueError("Bool column has NA values in "
+                                             "column {column}"
+                                             .format(column=c))
+                    except (AttributeError, TypeError):
+                        # invalid input to is_bool_dtype
+                        pass
                     cvals = self._cast_types(cvals, cast_type, c)
 
             result[c] = cvals
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
@@ -421,3 +421,21 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
 
     result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("data, na_values", [
+    ("false,1\n,1\ntrue", None),
+    ("false,1\nnull,1\ntrue", None),
+    ("false,1\nnan,1\ntrue", None),
+    ("false,1\nfoo,1\ntrue", 'foo'),
+    ("false,1\nfoo,1\ntrue", ['foo']),
+    ("false,1\nfoo,1\ntrue", {'a': 'foo'}),
+])
+def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
+    parser = all_parsers
+    msg = ("(Bool column has NA values in column [0a])|"
+           "(cannot safely convert passed user dtype of "
+           "bool for object dtyped data in column 0)")
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv(StringIO(data), header=None, names=['a', 'b'],
+                        dtype={'a': 'bool'}, na_values=na_values)