pandas-dev · SandroCasagrande · May 30, 2022 · May 30, 2022 · May 30, 2022 · May 30, 2022
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -285,6 +285,15 @@ def time_read_uint64_na_values(self):
         )
 
 
+class ReadUint8Integers(StringIORewind):  # times read_csv with dtype="uint8"
+    def setup(self):
+        arr = np.arange(10000).astype("uint8")  # cast wraps values into 0..255
+        self.data1 = StringIO("\n".join(arr.astype(str).tolist()))
+
+    def time_read_uint8(self):
+        read_csv(self.data(self.data1), header=None, names=["foo"], dtype="uint8")
+
+
 class ReadCSVThousands(BaseIO):
 
     fname = "__test__.csv"
@@ -567,7 +576,7 @@ def setup(self):
         self.StringIO_input = StringIO(data)
 
     def time_read_csv_index_col(self):
-        read_csv(self.StringIO_input, index_col="a")
+        read_csv(self.data(self.StringIO_input), index_col="a")
 
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -820,6 +820,7 @@ I/O
 - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
 - :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
 - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
+- Bug in :meth:`TextReader.read` where specifying a (non-extension) integer ``dtype`` could cause silent overflow or an unexpected return dtype (:issue:`47167`)
 -
 
 Period

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1176,19 +1176,34 @@ cdef class TextReader:
             return result, na_count
 
         elif is_integer_dtype(dtype):
-            try:
-                result, na_count = _try_int64(self.parser, i, start,
-                                              end, na_filter, na_hashset)
-                if user_dtype and na_count is not None:
-                    if na_count > 0:
-                        raise ValueError(f"Integer column has NA values in column {i}")
-            except OverflowError:
+            do_try_uint64 = False
+            if user_dtype and dtype == 'uint64':
+                do_try_uint64 = True
+            else:
+                try:
+                    result, na_count = _try_int64(self.parser, i, start,
+                                                  end, na_filter, na_hashset)
+                    if user_dtype and na_count is not None:
+                        if na_count > 0:
+                            raise ValueError(f"Integer column has NA values in column {i}")
+                except OverflowError as err:
+                    if user_dtype and dtype == 'int64':
+                        raise err
+                    do_try_uint64 = True
+
+            if do_try_uint64:
                 result = _try_uint64(self.parser, i, start, end,
                                      na_filter, na_hashset)
                 na_count = 0
 
-            if result is not None and dtype != 'int64':
-                result = result.astype(dtype)
+            if result is not None and dtype not in ('int64', 'uint64'):
+                casted = result.astype(dtype)
+                if (casted == result).all():
+                    result = casted
+                else:
+                    raise TypeError(
+                        f"cannot safely cast non-equivalent {result.dtype} to {dtype}"
+                    )
 
             return result, na_count
 

diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
@@ -13,8 +13,12 @@
 import pandas._libs.parsers as parser
 from pandas._libs.parsers import TextReader
 
-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    array,
+)
 import pandas._testing as tm
+from pandas.api.types import is_extension_array_dtype
 
 from pandas.io.parsers import (
     TextFileReader,
@@ -125,6 +129,45 @@ def test_integer_thousands_alt(self):
         expected = DataFrame([123456, 12500])
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "uint64",
+            "int64",
+            "uint32",
+            "int32",
+            "uint16",
+            "int16",
+            "uint8",
+            "int8",
+            "UInt64",  # capitalised names are pandas extension dtypes
+            "Int64",
+            "UInt32",
+            "Int32",
+            "UInt16",
+            "Int16",
+            "UInt8",
+            "Int8",
+        ],
+    )
+    def test_integer_overflow_with_user_dtype(self, dtype):  # max parses; max + 1 raises
+        dtype = ensure_dtype_objs(dtype)
+        is_ext_dtype = is_extension_array_dtype(dtype)
+        maxint = np.iinfo(dtype.type if is_ext_dtype else dtype).max
+
+        reader = TextReader(StringIO(f"{maxint}"), header=None, dtype=dtype)
+        result = reader.read()
+        if is_ext_dtype:  # extension dtypes are checked as extension arrays
+            expected = array([maxint], dtype=dtype)
+            tm.assert_extension_array_equal(result[0], expected)
+        else:
+            expected = np.array([maxint], dtype=dtype)
+            tm.assert_numpy_array_equal(result[0], expected)
+
+        reader = TextReader(StringIO(f"{maxint + 1}"), header=None, dtype=dtype)
+        with pytest.raises(Exception, match=""):  # NOTE(review): accepts any error
+            reader.read()
+
     def test_skip_bad_lines(self, capsys):
         # too many lines, see #2430 for why
         data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"