BUG: read_csv converting nans to 1 when casting bools to float (#44901)

phofl · web-flow · commit 788ccc4b95bf · 2021-12-17T09:29:24.000-05:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -758,6 +758,7 @@ I/O
 - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
 - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
+- Bug in :func:`read_csv` where reading a column of mixed booleans and missing values with a float ``dtype`` resulted in the missing values becoming ``1.0`` rather than ``NaN`` (:issue:`42808`, :issue:`34120`)
 - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
 - Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`)
 - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1086,8 +1086,27 @@ cdef class TextReader:
                     break
 
         # we had a fallback parse on the dtype, so now try to cast
-        # only allow safe casts, eg. with a nan you cannot safely cast to int
         if col_res is not None and col_dtype is not None:
+            # If col_res is bool, it might actually be a bool array mixed with NaNs
+            # (see _try_bool_flex()). Usually this would be taken care of using
+            # _maybe_upcast(), but if col_dtype is a floating type we should just
+            # take care of that cast here.
+            if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
+                mask = col_res.view(np.uint8) == na_values[np.uint8]
+                col_res = col_res.astype(col_dtype)
+                np.putmask(col_res, mask, np.nan)
+                return col_res, na_count
+
+            # NaNs are already cast to True here, so astype cannot be used
+            if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
+                if na_count > 0:
+                    raise ValueError(
+                        f"cannot safely convert passed user dtype of "
+                        f"{col_dtype} for {np.bool_} dtyped data in "
+                        f"column {i} due to NA values"
+                    )
+
+            # only allow safe casts, eg. with a nan you cannot safely cast to int
             try:
                 col_res = col_res.astype(col_dtype, casting='safe')
             except TypeError:
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -130,7 +130,11 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
                 frame.index.names = [None] * len(frame.index.names)
 
         if self.kwds.get("dtype") is not None:
-            frame = frame.astype(self.kwds.get("dtype"))
+            try:
+                frame = frame.astype(self.kwds.get("dtype"))
+            except TypeError as e:
+                # GH#44901 re-raise as ValueError to keep the API consistent
+                raise ValueError(e)
         return frame
 
     def read(self) -> DataFrame:
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
@@ -17,6 +17,7 @@
 import pandas._testing as tm
 
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 
 
 @skip_pyarrow
@@ -615,3 +616,41 @@ def test_nan_multi_index(all_parsers):
     )
 
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_bool_and_nan_to_bool(all_parsers):
+    # GH#42808: bools mixed with NaN must raise when read as dtype="bool"
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    with pytest.raises(ValueError, match="NA values"):
+        parser.read_csv(StringIO(data), dtype="bool")
+
+
+def test_bool_and_nan_to_int(all_parsers):
+    # GH#42808: bools mixed with NaN must raise when read as dtype="int"
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    with pytest.raises(ValueError, match="convert|NoneType"):
+        parser.read_csv(StringIO(data), dtype="int")
+
+
+def test_bool_and_nan_to_float(all_parsers):
+    # GH#42808: NaN must stay NaN (not become 1.0) when read as dtype="float"
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    result = parser.read_csv(StringIO(data), dtype="float")
+    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
+    tm.assert_frame_equal(result, expected)