Skip to content

Commit 1444b33

Browse files
JustinZhengBC authored and Pingviinituutti committed
GH20591 read_csv raise ValueError for bool columns with missing values (C engine) (pandas-dev#23968)
1 parent d068393 commit 1444b33

File tree

4 files changed

+36
-3
lines changed

4 files changed

+36
-3
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -376,6 +376,7 @@ Backwards incompatible API changes
376376
- :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`)
377377
- Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`)
378378
- ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`)
379+
- :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`)
379380

380381
.. _whatsnew_0240.api_breaking.deps:
381382

pandas/_libs/parsers.pyx

+4
Original file line number | Diff line number | Diff line change
@@ -1245,6 +1245,10 @@ cdef class TextReader:
12451245
result, na_count = _try_bool_flex(self.parser, i, start, end,
12461246
na_filter, na_hashset,
12471247
self.true_set, self.false_set)
1248+
if user_dtype and na_count is not None:
1249+
if na_count > 0:
1250+
raise ValueError("Bool column has NA values in "
1251+
"column {column}".format(column=i))
12481252
return result, na_count
12491253

12501254
elif dtype.kind == 'S':

pandas/io/parsers.py

+13-3
Original file line number | Diff line number | Diff line change
@@ -27,9 +27,9 @@
2727

2828
from pandas.core.dtypes.cast import astype_nansafe
2929
from pandas.core.dtypes.common import (
30-
ensure_object, is_categorical_dtype, is_dtype_equal, is_float, is_integer,
31-
is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
32-
is_string_dtype)
30+
ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
31+
is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype,
32+
is_scalar, is_string_dtype)
3333
from pandas.core.dtypes.dtypes import CategoricalDtype
3434
from pandas.core.dtypes.missing import isna
3535

@@ -1669,6 +1669,16 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
16691669

16701670
# type specified in dtype param
16711671
if cast_type and not is_dtype_equal(cvals, cast_type):
1672+
try:
1673+
if (is_bool_dtype(cast_type) and
1674+
not is_categorical_dtype(cast_type)
1675+
and na_count > 0):
1676+
raise ValueError("Bool column has NA values in "
1677+
"column {column}"
1678+
.format(column=c))
1679+
except (AttributeError, TypeError):
1680+
# invalid input to is_bool_dtype
1681+
pass
16721682
cvals = self._cast_types(cvals, cast_type, c)
16731683

16741684
result[c] = cvals

pandas/tests/io/parser/test_na_values.py

+18
Original file line number | Diff line number | Diff line change
@@ -421,3 +421,21 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
421421

422422
result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
423423
tm.assert_frame_equal(result, expected)
424+
425+
426+
@pytest.mark.parametrize("data, na_values", [
    ("false,1\n,1\ntrue", None),
    ("false,1\nnull,1\ntrue", None),
    ("false,1\nnan,1\ntrue", None),
    ("false,1\nfoo,1\ntrue", 'foo'),
    ("false,1\nfoo,1\ntrue", ['foo']),
    ("false,1\nfoo,1\ntrue", {'a': 'foo'}),
])
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    """Reading a column with missing values as ``bool`` must raise.

    Each parametrized case supplies CSV text whose first column holds a
    value that is treated as NA (an empty field, ``null``, ``nan``, or a
    user-supplied ``na_values`` entry) and requests ``dtype='bool'`` for
    that column.  ``read_csv`` is expected to raise ``ValueError``.
    """
    # Two message shapes are accepted.  The first may name the column
    # either by position (``0``) or by label (``a``), hence ``[0a]``.
    na_in_bool_error = "(Bool column has NA values in column [0a])"
    unsafe_cast_error = ("(cannot safely convert passed user dtype of "
                         "bool for object dtyped data in column 0)")
    expected_error = "|".join([na_in_bool_error, unsafe_cast_error])

    buffer = StringIO(data)
    with pytest.raises(ValueError, match=expected_error):
        all_parsers.read_csv(buffer, header=None, names=['a', 'b'],
                             dtype={'a': 'bool'}, na_values=na_values)

0 commit comments

Comments
 (0)