diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 10aef954a3475..012395b9afe1c 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -285,6 +285,15 @@ def time_read_uint64_na_values(self): ) +class ReadUint8Integers(StringIORewind): + def setup(self): + arr = np.tile(np.arange(256, dtype="uint8"), 50) + self.data1 = StringIO("\n".join(arr.astype(str).tolist())) + + def time_read_uint8(self): + read_csv(self.data(self.data1), header=None, names=["foo"], dtype="uint8") + + class ReadCSVThousands(BaseIO): fname = "__test__.csv" @@ -567,7 +576,7 @@ def setup(self): self.StringIO_input = StringIO(data) def time_read_csv_index_col(self): - read_csv(self.StringIO_input, index_col="a") + read_csv(self.data(self.StringIO_input), index_col="a") from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a1c374db91f8b..3b25b5dde4aba 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1136,6 +1136,8 @@ I/O - Bug in :func:`read_parquet` with ``use_nullable_dtypes=True`` where ``float64`` dtype was returned instead of nullable ``Float64`` dtype (:issue:`45694`) - Bug in :meth:`DataFrame.to_json` where ``PeriodDtype`` would not make the serialization roundtrip when read back with :meth:`read_json` (:issue:`44720`) - Bug in :func:`read_xml` when reading XML files with Chinese character tags and would raise ``XMLSyntaxError`` (:issue:`47902`) +- Bug in :func:`read_csv` where specifying a numpy integer ``dtype`` could cause silent overflow or an unexpected return dtype (:issue:`47167`) +- Bug in :func:`read_csv` where specifying a numpy integer ``dtype`` with ``engine="python"`` could cause silent lossy float coercion (:issue:`47167`) Period ^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1941cfde4acb9..3d713267b0edc 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ 
-1189,19 +1189,32 @@ cdef class TextReader: return result, na_count elif is_integer_dtype(dtype): - try: - result, na_count = _try_int64(self.parser, i, start, - end, na_filter, na_hashset) - if user_dtype and na_count is not None: - if na_count > 0: - raise ValueError(f"Integer column has NA values in column {i}") - except OverflowError: - result = _try_uint64(self.parser, i, start, end, - na_filter, na_hashset) + if user_dtype and dtype == "uint64": + result = _try_uint64(self.parser, i, start, + end, na_filter, na_hashset) na_count = 0 + else: + try: + result, na_count = _try_int64(self.parser, i, start, + end, na_filter, na_hashset) + except OverflowError as err: + if user_dtype and dtype == "int64": + raise err + result = _try_uint64(self.parser, i, start, + end, na_filter, na_hashset) + na_count = 0 + else: + if user_dtype and (na_count is not None) and (na_count > 0): + raise ValueError(f"Integer column has NA values in column {i}") - if result is not None and dtype != "int64": - result = result.astype(dtype) + if result is not None and dtype not in ("int64", "uint64"): + casted = result.astype(dtype) + if (casted == result).all(): + result = casted + else: + raise TypeError( + f"cannot safely cast non-equivalent {result.dtype} to {dtype}" + ) return result, na_count diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e6f4830846c77..eccb9cd12d0e3 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -49,6 +49,7 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.astype import astype_nansafe +from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, @@ -844,8 +845,11 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi values = values.astype(cast_type, copy=False) else: try: - values = astype_nansafe(values, cast_type, copy=True, skipna=True) - except 
ValueError as err: + if is_integer_dtype(cast_type): + values = maybe_cast_to_integer_array(values, cast_type, copy=True) + else: + values = astype_nansafe(values, cast_type, copy=True, skipna=True) + except (ValueError, OverflowError) as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" ) from err diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index e3159ef3e6a42..4d6d4cc4eb569 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -12,6 +12,11 @@ Series, ) import pandas._testing as tm +from pandas.api.types import ( + is_extension_array_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) # GH#43650: Some expected failures with the pyarrow engine can occasionally # cause a deadlock instead, so we skip these instead of xfailing @@ -110,6 +115,98 @@ def test_integer_overflow_bug(all_parsers, sep): tm.assert_frame_equal(result, expected) +def _iinfo(dtype): + pdtype = pandas_dtype(dtype) + iinfo = np.iinfo(pdtype.type if is_extension_array_dtype(dtype) else pdtype) + return iinfo + + +@skip_pyarrow +@pytest.mark.parametrize( + "getval", + [ + (lambda dtype: _iinfo(dtype).max), + (lambda dtype: _iinfo(dtype).min), + ], +) +def test_integer_limits_with_user_dtype(all_parsers, any_int_dtype, getval): + dtype = any_int_dtype + parser = all_parsers + val = getval(dtype) + data = f"A\n{val}" + + result = parser.read_csv(StringIO(data), dtype=dtype) + expected_result = DataFrame({"A": [val]}, dtype=dtype) + tm.assert_frame_equal(result, expected_result) + + +@skip_pyarrow +@pytest.mark.parametrize( + "getval", + [ + (lambda dtype: _iinfo(dtype).max + 1), + (lambda dtype: _iinfo(dtype).min - 1), + ], +) +def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval): + # see GH-47167 + dtype = any_int_dtype + parser = all_parsers + val = getval(dtype) + data = f"A\n{val}" + + expected = pytest.raises( # noqa: PDF010 + 
(OverflowError, TypeError, ValueError), + match="|".join( + [ + "Overflow", + "cannot safely cast non-equivalent", + "Integer out of range", + "Unable to convert column", + "The elements provided in the data cannot all be casted to the dtype", + ] + ), + ) + + # Specific case has intended behavior only after deprecation from #41734 becomes + # enforced. Until then, only expect a FutureWarning. + if ( + (parser.engine == "python") + and (not is_extension_array_dtype(dtype)) + and (dtype < np.dtype("int64")) + and not (is_unsigned_integer_dtype(dtype) and (val < 0)) + ): + expected = tm.assert_produces_warning( + FutureWarning, + match=f"Values are too large to be losslessly cast to {np.dtype(dtype)}.", + check_stacklevel=False, + ) + + with expected: + parser.read_csv(StringIO(data), dtype=dtype) + + +@skip_pyarrow +def test_integer_from_float_lossless(all_parsers, any_int_dtype): + dtype = any_int_dtype + parser = all_parsers + data = "A\n0\n0.0" + + result = parser.read_csv(StringIO(data), dtype=dtype) + expected_result = DataFrame({"A": [0, 0]}, dtype=dtype) + tm.assert_frame_equal(result, expected_result) + + +@skip_pyarrow +def test_integer_from_float_lossy(all_parsers, any_int_dtype): + dtype = any_int_dtype + parser = all_parsers + data = "A\n0\n0.1" + + with pytest.raises((TypeError, ValueError), match=None): + parser.read_csv(StringIO(data), dtype=dtype) + + def test_int64_min_issues(all_parsers): # see gh-2599 parser = all_parsers @@ -170,7 +267,7 @@ def test_int64_overflow(all_parsers, conv): ) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as string. + # range, so they should be parsed as integer value. 
parser = all_parsers result = parser.read_csv(StringIO(str(val)), header=None) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 61c493a2c368f..78b9770d623dc 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -556,7 +556,7 @@ def test_variable_width_unicode(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "int32"}]) +@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "float16"}]) def test_dtype(dtype): data = """ a b c 1 2 3.2 diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index f150ed3903443..9dd49732aea3b 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -13,8 +13,12 @@ import pandas._libs.parsers as parser from pandas._libs.parsers import TextReader -from pandas import DataFrame +from pandas import ( + DataFrame, + array, +) import pandas._testing as tm +from pandas.api.types import is_extension_array_dtype from pandas.io.parsers import ( TextFileReader, @@ -125,6 +129,24 @@ def test_integer_thousands_alt(self): expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected) + def test_integer_overflow_with_user_dtype(self, any_int_dtype): + dtype = ensure_dtype_objs(any_int_dtype) + is_ext_dtype = is_extension_array_dtype(dtype) + maxint = np.iinfo(dtype.type if is_ext_dtype else dtype).max + + reader = TextReader(StringIO(f"{maxint}"), header=None, dtype=dtype) + result = reader.read() + if is_ext_dtype: + expected = array([maxint], dtype=dtype) + tm.assert_extension_array_equal(result[0], expected) + else: + expected = np.array([maxint], dtype=dtype) + tm.assert_numpy_array_equal(result[0], expected) + + reader = TextReader(StringIO(f"{maxint + 1}"), header=None, dtype=dtype) + with pytest.raises((OverflowError, TypeError, ValueError), match=None): + 
reader.read() + def test_skip_bad_lines(self, capsys): # too many lines, see #2430 for why data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"