BUG: Raise on parse int overflow #47167 #47168


Closed
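
For context, the user-visible change this PR makes: with a user-specified integer dtype that is too small for a parsed value, read_csv could previously wrap the value silently instead of raising. A stdlib-only sketch of what that silent wrap looks like (pandas and numpy not required; the real parser works on ndarrays):

```python
# "Silent overflow" illustrated without pandas: a value that does not fit
# uint8 wraps modulo 256 under an unchecked cast. This PR makes read_csv
# detect the non-equivalent cast and raise instead of returning the wrap.
val = 300
wrapped = val & 0xFF   # unchecked cast to uint8 keeps only the low byte
print(wrapped)         # 44
assert wrapped != val  # the lossy result the parser now refuses to return
```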
Commits
50 commits
8d0efca
TST: integer overflow on parsing with insufficient user dtype
SandroCasagrande May 30, 2022
27629f0
BUG: raise on integer overflow when parsing with insufficient user dtype
SandroCasagrande May 30, 2022
bfb0b89
Fixes from pre-commit [automated commit]
SandroCasagrande May 30, 2022
661c853
DOC: added entry in whatsnew
SandroCasagrande May 30, 2022
ccb6f61
Introduce empty match in pytest.raises for flake8
SandroCasagrande May 30, 2022
a3b458a
Changed import location of is_extension_array_dtype for type check
SandroCasagrande May 30, 2022
994a634
PERF: avoid try parse as int64 if user specified uint64
SandroCasagrande May 31, 2022
567fd58
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande May 31, 2022
43bcb22
TST: simple asv for uint8 parsing
SandroCasagrande May 31, 2022
61a36a5
BUG: stringio rewind in asv ReadCSVIndexCol
SandroCasagrande May 31, 2022
927ddad
CLN: simplified conditional logic for int parsing
SandroCasagrande Jun 6, 2022
d992af5
Merge remote-tracking branch 'upstream/main' into raise-on-parse-int-…
SandroCasagrande Jul 22, 2022
498b93d
TST: reduced repetition by using any_int_dtype in test
SandroCasagrande Jul 23, 2022
9e1fbbc
TST: added tests for read_csv with both engines c and python
SandroCasagrande Jul 23, 2022
ef91ab5
BUG: raise on integer overflow when parsing with insufficient user dtype
SandroCasagrande Jul 23, 2022
270eb90
TST: added/modified tests to raise on lossy float conversion due to s…
SandroCasagrande Jul 24, 2022
dd5cd0e
DOC: minor correction in test docstring
SandroCasagrande Jul 24, 2022
64047b4
DOC: explained changes in whatsnew in terms of public api
SandroCasagrande Jul 24, 2022
d812c32
TST: added missing skip_pyarrow mark
SandroCasagrande Jul 25, 2022
b1f83b9
TST: specified exceptions in pytest.raises
SandroCasagrande Jul 25, 2022
8f4c947
TST: replaced loop cases with parametrized tests
SandroCasagrande Jul 25, 2022
a935ac9
Merge remote-tracking branch 'upstream/main' into raise-on-parse-int-…
SandroCasagrande Jul 26, 2022
0c9f4e8
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Jul 27, 2022
8353cba
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Jul 28, 2022
60b3018
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Aug 4, 2022
3e5f929
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Aug 9, 2022
ba40923
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Aug 12, 2022
3d72cf2
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Aug 13, 2022
3f39a5b
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Aug 16, 2022
7cf208f
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Aug 23, 2022
520cae3
CLN: moved na-check into else branch
SandroCasagrande Aug 23, 2022
88d8650
CLN: re-use maybe_cast_to_integer_array for checked cast in python pa…
SandroCasagrande Aug 25, 2022
d96d6b0
TST: specified expected exception
SandroCasagrande Aug 25, 2022
3508a9f
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Aug 25, 2022
2c16f74
TST: fixed int-overflow test
SandroCasagrande Aug 25, 2022
485dcfc
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Aug 26, 2022
5896e01
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Sep 5, 2022
b276196
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Sep 6, 2022
a1a6764
CLN: create asv input without overflow to prevent potential warnings
SandroCasagrande Sep 6, 2022
c9e8a92
DOC: fixed wording in whatsnew
SandroCasagrande Sep 6, 2022
39b5c91
TST: split float to int coercion test into two separate tests
SandroCasagrande Sep 6, 2022
92fab59
TST: improved comment and referenced issue
SandroCasagrande Sep 6, 2022
f653e96
TST: avoid conditional raise
SandroCasagrande Sep 6, 2022
8b406ab
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Sep 14, 2022
8d782fb
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Sep 23, 2022
09e6773
Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
SandroCasagrande Oct 10, 2022
a545602
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Oct 18, 2022
0439322
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Nov 2, 2022
b392f32
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Dec 15, 2022
fca428c
Merge branch 'main' into raise-on-parse-int-overflow
SandroCasagrande Dec 15, 2022
11 changes: 10 additions & 1 deletion asv_bench/benchmarks/io/csv.py
@@ -285,6 +285,15 @@ def time_read_uint64_na_values(self):
)


class ReadUint8Integers(StringIORewind):
def setup(self):
arr = np.tile(np.arange(256, dtype="uint8"), 50)
self.data1 = StringIO("\n".join(arr.astype(str).tolist()))

def time_read_uint8(self):
read_csv(self.data(self.data1), header=None, names=["foo"], dtype="uint8")


class ReadCSVThousands(BaseIO):

fname = "__test__.csv"
@@ -567,7 +576,7 @@ def setup(self):
self.StringIO_input = StringIO(data)

def time_read_csv_index_col(self):
read_csv(self.StringIO_input, index_col="a")
read_csv(self.data(self.StringIO_input), index_col="a")


from ..pandas_vb_common import setup # noqa: F401 isort:skip
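
The `self.data(...)` fix in `time_read_csv_index_col` above exists because a StringIO is exhausted after one read; without rewinding, every asv iteration after the first would parse an empty stream. A minimal sketch of the rewind helper (assumed to behave like `StringIORewind.data`):

```python
from io import StringIO

def rewind(buf):
    # assumed equivalent of StringIORewind.data: reset the cursor so each
    # benchmark iteration reads the full input again
    buf.seek(0)
    return buf

buf = StringIO("a\n1\n2\n")
first = rewind(buf).read()
second = rewind(buf).read()   # without the rewind this would be ""
assert first == second
```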
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -1136,6 +1136,8 @@ I/O
- Bug in :func:`read_parquet` with ``use_nullable_dtypes=True`` where ``float64`` dtype was returned instead of nullable ``Float64`` dtype (:issue:`45694`)
- Bug in :meth:`DataFrame.to_json` where ``PeriodDtype`` would not make the serialization roundtrip when read back with :meth:`read_json` (:issue:`44720`)
- Bug in :func:`read_xml` when reading XML files with Chinese character tags and would raise ``XMLSyntaxError`` (:issue:`47902`)
- Bug in :func:`read_csv` with specified numpy integer ``dtype`` can cause silent overflow or unexpected return dtype (:issue:`47167`)
- Bug in :func:`read_csv` with specified numpy integer ``dtype`` and ``engine="python"`` can cause silent lossy float coercion (:issue:`47167`)

Period
^^^^^^
35 changes: 24 additions & 11 deletions pandas/_libs/parsers.pyx
@@ -1189,19 +1189,32 @@ cdef class TextReader:
return result, na_count

elif is_integer_dtype(dtype):
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
if user_dtype and dtype == "uint64":
result = _try_uint64(self.parser, i, start,
end, na_filter, na_hashset)
na_count = 0
else:
try:
Contributor (review comment): the pattern here is to _try_dtype, then if that fails try another one. Can you just do that here (rather than all of this if/then logic)? E.g. add a _try_uint64 if needed.

Contributor Author: I tried to stay with the pattern _try_int64 -> if fail -> _try_uint64, but shortcut the two cases with user_dtype and dtype in ["int64", "uint64"], where we can fail after the specific _try_dtype. I added another _try_uint64 instead of the do_try_uint64. Is that okay?

Member: Hm, I agree with @jreback, this is hard to read.

Contributor Author (SandroCasagrande, Sep 6, 2022): I also agree and added a _try_uint64 in commit 927ddad like @jreback suggested. Were you also referring to the former version with do_try_uint64, @phofl? The latest version is https://github.com/SandroCasagrande/pandas/blob/5896e017ca2960e0d535c8c0a0b9db978377bc91/pandas/_libs/parsers.pyx#L1182-L1198. Sorry if I did not make clear that I had pushed changes and that the newest version should be reviewed again. Can you please have a look, @jreback or @phofl?


result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
except OverflowError as err:
if user_dtype and dtype == "int64":
raise err
result = _try_uint64(self.parser, i, start,
end, na_filter, na_hashset)
na_count = 0
else:
if user_dtype and (na_count is not None) and (na_count > 0):
raise ValueError(f"Integer column has NA values in column {i}")

if result is not None and dtype != "int64":
result = result.astype(dtype)
if result is not None and dtype not in ("int64", "uint64"):
casted = result.astype(dtype)
if (casted == result).all():
Member (review comment): This looks expensive. Can you run asvs?

Contributor Author: Good point, thanks. I thought the same thing when I saw exactly this check being applied for parsing (and, more generally, setting) with the nullable integer extension dtypes:

    casted = values.astype(dtype, copy=copy)
    if (casted == values).all():
        return casted

However, I could not come up with a better version (one that does not repeat code or use fused types(?) in _try_int64). And finally, the impact is negligible (see below).

Just to clarify: the additional check is never performed when parsing with automatically inferred dtype (since TextReader.dtype_cast_order contains only int64 and no other integer dtypes), nor with user-specified dtype int64. I just committed another change that prevents unnecessarily running into this check when parsing with user-specified dtype uint64.
The check then only affects parsing with a user-specified integer dtype of size < 64 bit. As far as I can see, none of the existing asvs covers this.

Just to be sure I ran some existing asvs that seemed most relevant and found no significant change:

       before           after         ratio
     [c355145c]       [994a6345]
     <raise-on-parse-int-overflow~7>       <raise-on-parse-int-overflow>
       6.97±0.3ms       7.46±0.7ms     1.07  io.csv.ParseDateComparison.time_read_csv_dayfirst(False)
       3.43±0.1ms      3.39±0.04ms     0.99  io.csv.ParseDateComparison.time_read_csv_dayfirst(True)
       6.93±0.3ms       7.02±0.1ms     1.01  io.csv.ParseDateComparison.time_to_datetime_dayfirst(False)
       3.52±0.3ms      3.28±0.08ms     0.93  io.csv.ParseDateComparison.time_to_datetime_dayfirst(True)
       17.2±0.2ms      17.3±0.06ms     1.00  io.csv.ParseDateComparison.time_to_datetime_format_DD_MM_YYYY(False)
       3.33±0.2ms       3.35±0.5ms     1.01  io.csv.ParseDateComparison.time_to_datetime_format_DD_MM_YYYY(True)
      1.74±0.01ms      1.74±0.09ms     1.00  io.csv.ReadCSVCachedParseDates.time_read_csv_cached(False, 'c')
      2.82±0.03ms      2.81±0.03ms     0.99  io.csv.ReadCSVCachedParseDates.time_read_csv_cached(False, 'python')
      1.77±0.08ms      1.79±0.02ms     1.01  io.csv.ReadCSVCachedParseDates.time_read_csv_cached(True, 'c')
      2.85±0.02ms       2.87±0.1ms     1.01  io.csv.ReadCSVCachedParseDates.time_read_csv_cached(True, 'python')
       25.8±0.8ms       26.3±0.8ms     1.02  io.csv.ReadCSVCategorical.time_convert_direct('c')
          179±3ms          180±4ms     1.01  io.csv.ReadCSVCategorical.time_convert_direct('python')
         44.0±1ms         44.4±1ms     1.01  io.csv.ReadCSVCategorical.time_convert_post('c')
          169±4ms          168±3ms     0.99  io.csv.ReadCSVCategorical.time_convert_post('python')
         21.8±1ms         21.5±1ms     0.99  io.csv.ReadCSVComment.time_comment('c')
       22.5±0.5ms       21.8±0.7ms     0.97  io.csv.ReadCSVComment.time_comment('python')
       24.9±0.9ms         25.1±2ms     1.01  io.csv.ReadCSVConcatDatetime.time_read_csv
         13.1±1ms       12.0±0.3ms     0.92  io.csv.ReadCSVConcatDatetimeBadDateValue.time_read_csv('')
       9.37±0.3ms       9.38±0.1ms     1.00  io.csv.ReadCSVConcatDatetimeBadDateValue.time_read_csv('0')
       14.7±0.4ms         15.0±1ms     1.02  io.csv.ReadCSVConcatDatetimeBadDateValue.time_read_csv('nan')
         86.1±2ms         87.7±2ms     1.02  io.csv.ReadCSVDInferDatetimeFormat.time_read_csv(False, 'custom')
      1.55±0.02ms      1.57±0.02ms     1.01  io.csv.ReadCSVDInferDatetimeFormat.time_read_csv(False, 'iso8601')
      1.47±0.01ms      1.51±0.04ms     1.03  io.csv.ReadCSVDInferDatetimeFormat.time_read_csv(False, 'ymd')
      4.39±0.06ms       4.45±0.1ms     1.01  io.csv.ReadCSVDInferDatetimeFormat.time_read_csv(True, 'custom')
      1.80±0.05ms      1.86±0.06ms     1.03  io.csv.ReadCSVDInferDatetimeFormat.time_read_csv(True, 'iso8601')
      2.00±0.01ms      2.00±0.02ms     1.00  io.csv.ReadCSVDInferDatetimeFormat.time_read_csv(True, 'ymd')
       19.4±0.3ms         19.8±1ms     1.02  io.csv.ReadCSVEngine.time_read_bytescsv('c')
       6.25±0.4ms       6.45±0.6ms     1.03  io.csv.ReadCSVEngine.time_read_bytescsv('pyarrow')
          303±2ms          301±3ms     0.99  io.csv.ReadCSVEngine.time_read_bytescsv('python')
       20.0±0.6ms       19.3±0.5ms     0.96  io.csv.ReadCSVEngine.time_read_stringcsv('c')
       7.43±0.5ms       6.93±0.1ms     0.93  io.csv.ReadCSVEngine.time_read_stringcsv('pyarrow')
          264±2ms          262±6ms     0.99  io.csv.ReadCSVEngine.time_read_stringcsv('python')
      1.33±0.02ms      1.34±0.02ms     1.01  io.csv.ReadCSVFloatPrecision.time_read_csv(',', '.', 'high')
      2.23±0.02ms      2.22±0.02ms     1.00  io.csv.ReadCSVFloatPrecision.time_read_csv(',', '.', 'round_trip')
      1.35±0.06ms      1.36±0.03ms     1.01  io.csv.ReadCSVFloatPrecision.time_read_csv(',', '.', None)
      1.39±0.01ms      1.41±0.01ms     1.01  io.csv.ReadCSVFloatPrecision.time_read_csv(',', '_', 'high')
      1.41±0.01ms      1.44±0.04ms     1.02  io.csv.ReadCSVFloatPrecision.time_read_csv(',', '_', 'round_trip')
       1.47±0.2ms      1.43±0.02ms     0.98  io.csv.ReadCSVFloatPrecision.time_read_csv(',', '_', None)
      1.30±0.02ms      1.33±0.04ms     1.03  io.csv.ReadCSVFloatPrecision.time_read_csv(';', '.', 'high')
      2.21±0.06ms       2.25±0.1ms     1.02  io.csv.ReadCSVFloatPrecision.time_read_csv(';', '.', 'round_trip')
      1.35±0.07ms      1.33±0.03ms     0.99  io.csv.ReadCSVFloatPrecision.time_read_csv(';', '.', None)
      1.42±0.03ms      1.41±0.01ms     1.00  io.csv.ReadCSVFloatPrecision.time_read_csv(';', '_', 'high')
      1.41±0.01ms       1.47±0.2ms     1.05  io.csv.ReadCSVFloatPrecision.time_read_csv(';', '_', 'round_trip')
      1.40±0.01ms      1.41±0.01ms     1.01  io.csv.ReadCSVFloatPrecision.time_read_csv(';', '_', None)
       3.43±0.3ms       3.40±0.2ms     0.99  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(',', '.', 'high')
       3.45±0.2ms      3.34±0.03ms     0.97  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(',', '.', 'round_trip')
      3.41±0.03ms       3.42±0.2ms     1.00  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(',', '.', None)
      2.83±0.03ms      2.82±0.02ms     1.00  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(',', '_', 'high')
      2.81±0.01ms      2.82±0.03ms     1.00  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(',', '_', 'round_trip')
      2.83±0.09ms      2.80±0.02ms     0.99  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(',', '_', None)
      3.35±0.03ms      3.32±0.03ms     0.99  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(';', '.', 'high')
       3.52±0.2ms       3.55±0.2ms     1.01  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(';', '.', 'round_trip')
       3.50±0.1ms       3.76±0.2ms     1.08  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(';', '.', None)
      2.80±0.02ms      2.79±0.02ms     0.99  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(';', '_', 'high')
      2.84±0.07ms      2.83±0.07ms     1.00  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(';', '_', 'round_trip')
      2.81±0.03ms      2.81±0.04ms     1.00  io.csv.ReadCSVFloatPrecision.time_read_csv_python_engine(';', '_', None)
      9.98±0.07ms       9.88±0.3ms     0.99  io.csv.ReadCSVIndexCol.time_read_csv_index_col
       69.0±0.4ms       74.1±0.3ms     1.07  io.csv.ReadCSVMemMapUTF8.time_read_memmapped_utf8
                0                0      n/a  io.csv.ReadCSVMemoryGrowth.mem_parser_chunks('c')
                0                0      n/a  io.csv.ReadCSVMemoryGrowth.mem_parser_chunks('python')
      1.45±0.01ms      1.56±0.06ms     1.07  io.csv.ReadCSVParseDates.time_baseline('c')
      1.58±0.03ms      1.57±0.02ms     0.99  io.csv.ReadCSVParseDates.time_baseline('python')
      1.78±0.01ms      1.81±0.06ms     1.01  io.csv.ReadCSVParseDates.time_multiple_date('c')
      1.99±0.03ms       2.30±0.3ms    ~1.15  io.csv.ReadCSVParseDates.time_multiple_date('python')
      2.99±0.03ms      2.93±0.02ms     0.98  io.csv.ReadCSVParseSpecialDate.time_read_special_date('hm', 'c')
       13.3±0.1ms       13.2±0.2ms     0.99  io.csv.ReadCSVParseSpecialDate.time_read_special_date('hm', 'python')
       7.70±0.2ms      7.54±0.07ms     0.98  io.csv.ReadCSVParseSpecialDate.time_read_special_date('mY', 'c')
       40.0±0.8ms         39.9±2ms     1.00  io.csv.ReadCSVParseSpecialDate.time_read_special_date('mY', 'python')
      3.41±0.05ms      3.42±0.03ms     1.00  io.csv.ReadCSVParseSpecialDate.time_read_special_date('mdY', 'c')
       14.2±0.8ms       13.9±0.6ms     0.98  io.csv.ReadCSVParseSpecialDate.time_read_special_date('mdY', 'python')
       10.3±0.1ms       11.0±0.2ms     1.06  io.csv.ReadCSVSkipRows.time_skipprows(10000, 'c')
       8.88±0.6ms       8.13±0.3ms     0.92  io.csv.ReadCSVSkipRows.time_skipprows(10000, 'pyarrow')
         46.3±1ms         46.4±2ms     1.00  io.csv.ReadCSVSkipRows.time_skipprows(10000, 'python')
       14.8±0.4ms       15.0±0.3ms     1.02  io.csv.ReadCSVSkipRows.time_skipprows(None, 'c')
       8.15±0.6ms       8.34±0.3ms     1.02  io.csv.ReadCSVSkipRows.time_skipprows(None, 'pyarrow')
         68.0±1ms         64.1±2ms     0.94  io.csv.ReadCSVSkipRows.time_skipprows(None, 'python')
       14.1±0.3ms       14.5±0.3ms     1.03  io.csv.ReadCSVThousands.time_thousands(',', ',', 'c')
          166±4ms          161±3ms     0.97  io.csv.ReadCSVThousands.time_thousands(',', ',', 'python')
       11.4±0.2ms       11.6±0.2ms     1.02  io.csv.ReadCSVThousands.time_thousands(',', None, 'c')
       58.4±0.9ms         58.0±1ms     0.99  io.csv.ReadCSVThousands.time_thousands(',', None, 'python')
       13.4±0.2ms       13.8±0.2ms     1.02  io.csv.ReadCSVThousands.time_thousands('|', ',', 'c')
          168±3ms          164±6ms     0.98  io.csv.ReadCSVThousands.time_thousands('|', ',', 'python')
       11.3±0.2ms       11.8±0.4ms     1.04  io.csv.ReadCSVThousands.time_thousands('|', None, 'c')
         59.2±1ms       57.7±0.9ms     0.97  io.csv.ReadCSVThousands.time_thousands('|', None, 'python')
       3.38±0.6ms       3.55±0.2ms     1.05  io.csv.ReadUint64Integers.time_read_uint64
       5.53±0.2ms       5.67±0.2ms     1.03  io.csv.ReadUint64Integers.time_read_uint64_na_values
       5.62±0.4ms       5.43±0.2ms     0.96  io.csv.ReadUint64Integers.time_read_uint64_neg_values
          114±2ms          112±1ms     0.98  io.csv.ToCSV.time_frame('long')
       16.5±0.2ms       16.3±0.2ms     0.99  io.csv.ToCSV.time_frame('mixed')
         91.3±1ms         94.2±3ms     1.03  io.csv.ToCSV.time_frame('wide')
       7.71±0.4ms       7.79±0.3ms     1.01  io.csv.ToCSVDatetime.time_frame_date_formatting
      7.15±0.06ms      7.26±0.08ms     1.02  io.csv.ToCSVDatetimeBig.time_frame(1000)
         66.7±1ms       66.7±0.6ms     1.00  io.csv.ToCSVDatetimeBig.time_frame(10000)
          671±5ms          668±6ms     1.00  io.csv.ToCSVDatetimeBig.time_frame(100000)
          380±2ms          379±7ms     1.00  io.csv.ToCSVDatetimeIndex.time_frame_date_formatting_index
          142±1ms          142±2ms     1.00  io.csv.ToCSVDatetimeIndex.time_frame_date_no_format_index
          711±9ms          715±9ms     1.01  io.csv.ToCSVIndexes.time_head_of_multiindex
          721±5ms          718±6ms     1.00  io.csv.ToCSVIndexes.time_multiindex
          580±8ms          579±6ms     1.00  io.csv.ToCSVIndexes.time_standard_index
          238±5ms          234±4ms     0.98  io.csv.ToCSVMultiIndexUnusedLevels.time_full_frame
       21.5±0.1ms       22.1±0.3ms     1.03  io.csv.ToCSVMultiIndexUnusedLevels.time_single_index_frame
       22.5±0.5ms       22.6±0.2ms     1.00  io.csv.ToCSVMultiIndexUnusedLevels.time_sliced_frame
         798±60ms         747±10ms     0.94  io.excel.ReadExcel.time_read_excel('odf')
         196±10ms          180±3ms     0.92  io.excel.ReadExcel.time_read_excel('openpyxl')
         43.5±2ms         41.0±1ms     0.94  io.excel.ReadExcel.time_read_excel('xlrd')
         439±20ms          420±7ms     0.96  io.excel.WriteExcel.time_write_excel('openpyxl')
         233±10ms         230±10ms     0.99  io.excel.WriteExcel.time_write_excel('xlsxwriter')
          220±8ms          227±4ms     1.03  io.excel.WriteExcel.time_write_excel('xlwt')

I added a simple asv benchmark that triggers the relevant check. It is also not affected at all

       before           after         ratio
     [c355145c]       [994a6345]
     <raise-on-parse-int-overflow~7>       <raise-on-parse-int-overflow>
       1.34±0.1ms      1.31±0.04ms     0.98  io.csv.ReadUint8Integers.time_read_uint8

A separate timeit on the comparison (arr1 == arr2).all() shows that it takes ~5µs compared to ~1ms of the total read_csv.
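
That timing claim can be approximated with a stdlib-only stand-in for the ndarray comparison (numbers are machine-dependent; plain lists of the same length stand in for the arrays in the diff):

```python
import timeit

# Element-wise equality over ~12800 values, mirroring (casted == result).all()
a = list(range(256)) * 50
b = list(a)
per_call = timeit.timeit(lambda: a == b, number=1000) / 1000
print(f"{per_call * 1e6:.1f} us per comparison")  # machine-dependent
```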

Contributor Author: Some additional remarks: For running the asvs in io.csv I had to:

Member: @jbrockmendel, should _dtype_can_hold_range be used here to check int64 lossless conversion?

Member: dtype_can_hold_range is specific to range objects. Looks like we have an ndarray here?

Contributor Author: Yes, comparing ndarrays here. I could change it to np.array_equal for readability, but apart from some safety boilerplate the same check is performed there: https://github.com/numpy/numpy/blob/50a74fb65fc752e77a2f9e9e2b7227629c2ba953/numpy/core/numeric.py#L2468

result = casted
else:
raise TypeError(
f"cannot safely cast non-equivalent {result.dtype} to {dtype}"
)

return result, na_count
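
The control flow being discussed above can be summarized as a plain-Python sketch (hypothetical helper: bounds are hard-coded for two small dtypes, and a wrapping cast stands in for ndarray.astype; the real code in parsers.pyx operates on C buffers):

```python
INT64_MIN, INT64_MAX = -2**63, 2**63 - 1
UINT64_MAX = 2**64 - 1
BOUNDS = {"uint8": (0, 255), "int8": (-128, 127)}  # only two dtypes for the sketch

def parse_int_column(tokens, dtype="int64", user_dtype=False):
    vals = [int(t) for t in tokens]
    # Shortcut from the PERF commit: user asked for uint64, skip the int64 attempt
    if user_dtype and dtype == "uint64":
        if not all(0 <= v <= UINT64_MAX for v in vals):
            raise OverflowError("Overflow")
        return vals, "uint64"
    # Usual pattern: try int64, fall back to uint64 on overflow...
    if all(INT64_MIN <= v <= INT64_MAX for v in vals):
        result, out = vals, "int64"
    else:
        # ...unless the user explicitly asked for int64
        if user_dtype and dtype == "int64":
            raise OverflowError("Overflow")
        if not all(0 <= v <= UINT64_MAX for v in vals):
            raise OverflowError("Overflow")
        result, out = vals, "uint64"
    # Checked cast to a smaller user dtype: wrap, then verify equivalence
    if user_dtype and dtype not in ("int64", "uint64"):
        lo, hi = BOUNDS[dtype]
        span = hi - lo + 1
        casted = [((v - lo) % span) + lo for v in result]  # wrapping cast
        if casted != result:
            raise TypeError(f"cannot safely cast non-equivalent int64 to {dtype}")
        result, out = casted, dtype
    return result, out

print(parse_int_column(["255"], dtype="uint8", user_dtype=True))  # ([255], 'uint8')
```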

8 changes: 6 additions & 2 deletions pandas/io/parsers/base_parser.py
@@ -49,6 +49,7 @@
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_nansafe
from pandas.core.dtypes.cast import maybe_cast_to_integer_array
from pandas.core.dtypes.common import (
ensure_object,
is_bool_dtype,
@@ -844,8 +845,11 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi
values = values.astype(cast_type, copy=False)
else:
try:
values = astype_nansafe(values, cast_type, copy=True, skipna=True)
except ValueError as err:
if is_integer_dtype(cast_type):
values = maybe_cast_to_integer_array(values, cast_type, copy=True)
else:
values = astype_nansafe(values, cast_type, copy=True, skipna=True)
except (ValueError, OverflowError) as err:
raise ValueError(
f"Unable to convert column {column} to type {cast_type}"
) from err
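
A sketch of what routing integer dtypes through the checked cast buys the python engine: lossy float coercion and out-of-range values now surface as exceptions instead of silently truncating (hypothetical helper; the real maybe_cast_to_integer_array works on ndarrays):

```python
def cast_column_to_int(values, bounds):
    # values: tokens the python engine has already parsed into Python numbers
    # bounds: (lo, hi) of the requested integer dtype, e.g. (0, 255) for uint8
    lo, hi = bounds
    result = []
    for v in values:
        i = int(v)
        if i != v:  # e.g. 0.1: the lossy float coercion that used to pass silently
            raise ValueError("Unable to convert column: lossy float coercion")
        if not lo <= i <= hi:  # overflow for the requested dtype
            raise OverflowError("Unable to convert column: integer overflow")
        result.append(i)
    return result

print(cast_column_to_int([0, 0.0], (0, 255)))  # [0, 0] -- lossless, accepted
```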
99 changes: 98 additions & 1 deletion pandas/tests/io/parser/common/test_ints.py
@@ -12,6 +12,11 @@
Series,
)
import pandas._testing as tm
from pandas.api.types import (
is_extension_array_dtype,
is_unsigned_integer_dtype,
pandas_dtype,
)

# GH#43650: Some expected failures with the pyarrow engine can occasionally
# cause a deadlock instead, so we skip these instead of xfailing
@@ -110,6 +115,98 @@ def test_integer_overflow_bug(all_parsers, sep):
tm.assert_frame_equal(result, expected)


def _iinfo(dtype):
pdtype = pandas_dtype(dtype)
iinfo = np.iinfo(pdtype.type if is_extension_array_dtype(dtype) else pdtype)
return iinfo


@skip_pyarrow
@pytest.mark.parametrize(
"getval",
[
(lambda dtype: _iinfo(dtype).max),
(lambda dtype: _iinfo(dtype).min),
],
)
def test_integer_limits_with_user_dtype(all_parsers, any_int_dtype, getval):
dtype = any_int_dtype
parser = all_parsers
val = getval(dtype)
data = f"A\n{val}"

result = parser.read_csv(StringIO(data), dtype=dtype)
expected_result = DataFrame({"A": [val]}, dtype=dtype)
tm.assert_frame_equal(result, expected_result)


@skip_pyarrow
@pytest.mark.parametrize(
"getval",
[
(lambda dtype: _iinfo(dtype).max + 1),
(lambda dtype: _iinfo(dtype).min - 1),
],
)
def test_integer_overflow_with_user_dtype(all_parsers, any_int_dtype, getval):
# see GH-47167
dtype = any_int_dtype
parser = all_parsers
val = getval(dtype)
data = f"A\n{val}"

expected = pytest.raises( # noqa: PDF010
(OverflowError, TypeError, ValueError),
match="|".join(
[
"Overflow",
"cannot safely cast non-equivalent",
"Integer out of range",
"Unable to convert column",
"The elements provided in the data cannot all be casted to the dtype",
]
),
)

# Specific case has intended behavior only after deprecation from #41734 becomes
# enforced. Until then, only expect a FutureWarning.
if (
(parser.engine == "python")
and (not is_extension_array_dtype(dtype))
and (dtype < np.dtype("int64"))
and not (is_unsigned_integer_dtype(dtype) and (val < 0))
):
expected = tm.assert_produces_warning(
FutureWarning,
match=f"Values are too large to be losslessly cast to {np.dtype(dtype)}.",
check_stacklevel=False,
)

with expected:
parser.read_csv(StringIO(data), dtype=dtype)


@skip_pyarrow
def test_integer_from_float_lossless(all_parsers, any_int_dtype):
dtype = any_int_dtype
parser = all_parsers
data = "A\n0\n0.0"

result = parser.read_csv(StringIO(data), dtype=dtype)
expected_result = DataFrame({"A": [0, 0]}, dtype=dtype)
tm.assert_frame_equal(result, expected_result)


@skip_pyarrow
def test_integer_from_float_lossy(all_parsers, any_int_dtype):
dtype = any_int_dtype
parser = all_parsers
data = "A\n0\n0.1"

with pytest.raises((TypeError, ValueError), match=None):
parser.read_csv(StringIO(data), dtype=dtype)


def test_int64_min_issues(all_parsers):
# see gh-2599
parser = all_parsers
Expand Down Expand Up @@ -170,7 +267,7 @@ def test_int64_overflow(all_parsers, conv):
)
def test_int64_uint64_range(all_parsers, val):
# These numbers fall right inside the int64-uint64
# range, so they should be parsed as string.
# range, so they should be parsed as integer value.
parser = all_parsers
result = parser.read_csv(StringIO(str(val)), header=None)
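
The _iinfo helper used by the tests above reduces to these dtype bounds; a numpy-free sketch for reference (hypothetical helper):

```python
def iinfo_bounds(bits, signed):
    # min/max of an integer dtype, e.g. iinfo_bounds(8, signed=False) for uint8
    if signed:
        return -(2 ** (bits - 1)), 2 ** (bits - 1) - 1
    return 0, 2 ** bits - 1

print(iinfo_bounds(8, signed=False))  # (0, 255)
print(iinfo_bounds(64, signed=True))  # (-9223372036854775808, 9223372036854775807)
```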

2 changes: 1 addition & 1 deletion pandas/tests/io/parser/test_read_fwf.py
@@ -556,7 +556,7 @@ def test_variable_width_unicode():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "int32"}])
@pytest.mark.parametrize("dtype", [{}, {"a": "float64", "b": str, "c": "float16"}])
def test_dtype(dtype):
data = """ a b c
1 2 3.2
24 changes: 23 additions & 1 deletion pandas/tests/io/parser/test_textreader.py
@@ -13,8 +13,12 @@
import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader

from pandas import DataFrame
from pandas import (
DataFrame,
array,
)
import pandas._testing as tm
from pandas.api.types import is_extension_array_dtype

from pandas.io.parsers import (
TextFileReader,
@@ -125,6 +129,24 @@ def test_integer_thousands_alt(self):
expected = DataFrame([123456, 12500])
tm.assert_frame_equal(result, expected)

def test_integer_overflow_with_user_dtype(self, any_int_dtype):
dtype = ensure_dtype_objs(any_int_dtype)
is_ext_dtype = is_extension_array_dtype(dtype)
maxint = np.iinfo(dtype.type if is_ext_dtype else dtype).max

reader = TextReader(StringIO(f"{maxint}"), header=None, dtype=dtype)
result = reader.read()
if is_ext_dtype:
expected = array([maxint], dtype=dtype)
tm.assert_extension_array_equal(result[0], expected)
else:
expected = np.array([maxint], dtype=dtype)
tm.assert_numpy_array_equal(result[0], expected)

reader = TextReader(StringIO(f"{maxint + 1}"), header=None, dtype=dtype)
with pytest.raises((OverflowError, TypeError, ValueError), match=None):
reader.read()

def test_skip_bad_lines(self, capsys):
# too many lines, see #2430 for why
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"