Skip to content

Commit 788ccc4

Browse files
authored
BUG: read_csv converting nans to 1 when casting bools to float (#44901)
1 parent e9ab6de commit 788ccc4

File tree

4 files changed

+65
-2
lines changed

4 files changed

+65
-2
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,7 @@ I/O
758758
- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
759759
- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`)
760760
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
761+
- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`, :issue:`34120`)
761762
- Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
762763
- Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`)
763764
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)

pandas/_libs/parsers.pyx

+20-1
Original file line numberDiff line numberDiff line change
@@ -1086,8 +1086,27 @@ cdef class TextReader:
10861086
break
10871087

10881088
# we had a fallback parse on the dtype, so now try to cast
1089-
# only allow safe casts, eg. with a nan you cannot safely cast to int
10901089
if col_res is not None and col_dtype is not None:
1090+
# If col_res is bool, it might actually be a bool array mixed with NaNs
1091+
# (see _try_bool_flex()). Usually this would be taken care of using
1092+
# _maybe_upcast(), but if col_dtype is a floating type we should just
1093+
# take care of that cast here.
1094+
if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
1095+
mask = col_res.view(np.uint8) == na_values[np.uint8]
1096+
col_res = col_res.astype(col_dtype)
1097+
np.putmask(col_res, mask, np.nan)
1098+
return col_res, na_count
1099+
1100+
# NaNs are already cast to True here, so we cannot use astype
1101+
if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
1102+
if na_count > 0:
1103+
raise ValueError(
1104+
f"cannot safely convert passed user dtype of "
1105+
f"{col_dtype} for {np.bool_} dtyped data in "
1106+
f"column {i} due to NA values"
1107+
)
1108+
1109+
# only allow safe casts, eg. with a nan you cannot safely cast to int
10911110
try:
10921111
col_res = col_res.astype(col_dtype, casting='safe')
10931112
except TypeError:

pandas/io/parsers/arrow_parser_wrapper.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,11 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
130130
frame.index.names = [None] * len(frame.index.names)
131131

132132
if self.kwds.get("dtype") is not None:
133-
frame = frame.astype(self.kwds.get("dtype"))
133+
try:
134+
frame = frame.astype(self.kwds.get("dtype"))
135+
except TypeError as e:
136+
# GH#44901 reraise to keep api consistent
137+
raise ValueError(e)
134138
return frame
135139

136140
def read(self) -> DataFrame:

pandas/tests/io/parser/test_na_values.py

+39
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import pandas._testing as tm
1818

1919
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
20+
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
2021

2122

2223
@skip_pyarrow
@@ -615,3 +616,41 @@ def test_nan_multi_index(all_parsers):
615616
)
616617

617618
tm.assert_frame_equal(result, expected)
619+
620+
621+
@xfail_pyarrow
622+
def test_bool_and_nan_to_bool(all_parsers):
623+
# GH#42808
624+
parser = all_parsers
625+
data = """0
626+
NaN
627+
True
628+
False
629+
"""
630+
with pytest.raises(ValueError, match="NA values"):
631+
parser.read_csv(StringIO(data), dtype="bool")
632+
633+
634+
def test_bool_and_nan_to_int(all_parsers):
635+
# GH#42808
636+
parser = all_parsers
637+
data = """0
638+
NaN
639+
True
640+
False
641+
"""
642+
with pytest.raises(ValueError, match="convert|NoneType"):
643+
parser.read_csv(StringIO(data), dtype="int")
644+
645+
646+
def test_bool_and_nan_to_float(all_parsers):
647+
# GH#42808
648+
parser = all_parsers
649+
data = """0
650+
NaN
651+
True
652+
False
653+
"""
654+
result = parser.read_csv(StringIO(data), dtype="float")
655+
expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
656+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)