diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2fe289a5f7c35..e6835e793cdcb 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -645,6 +645,7 @@ I/O - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`) - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) +- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`) Period diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index d096e9008112b..116217e8c3ec1 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -705,7 +705,7 @@ def _infer_types(self, values, na_values, try_num_bool=True): # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected # "Union[Union[ExtensionArray, ndarray], Index, Series]" mask = algorithms.isin(values, list(na_values)) # type: ignore[arg-type] - na_count = mask.sum() + na_count = mask.astype("uint8", copy=False).sum() if na_count > 0: if is_integer_dtype(values): values = values.astype(np.float64) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 646cb2029919d..7315dcc0c4c07 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -297,3 +297,27 @@ def test_multiindex_columns_index_col_with_data(all_parsers): index=Index(["data"]), ) tm.assert_frame_equal(result, expected) + + 
+@skip_pyarrow
+def test_infer_types_boolean_sum(all_parsers):
+    # GH#44079: index column read with a nullable integer dtype ("UInt8")
+    parser = all_parsers
+    result = parser.read_csv(
+        StringIO("0,1"),
+        names=["a", "b"],
+        index_col=["a"],
+        dtype={"a": "UInt8"},
+    )
+    expected = DataFrame(
+        data={
+            "a": [
+                0,
+            ],
+            "b": [1],
+        }
+    ).set_index("a")
+    # Not checking index type now, because the C parser will return an
+    # index column of dtype 'object', and the Python parser will return an
+    # index column of dtype 'int64'.
+    tm.assert_frame_equal(result, expected, check_index_type=False)