BUG: read_csv overflowing for ea int with nulls (#50847)

phofl · web-flow · commit c1ce0a7d3e6b · 2023-01-19T10:04:50.000-08:00
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1023,6 +1023,7 @@ I/O
 - Bug in :meth:`DataFrame.to_string` ignoring float formatter for extension arrays (:issue:`39336`)
 - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
 - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
+- Bug in :func:`read_csv` unnecessarily overflowing for extension array dtype when containing ``NA`` (:issue:`32134`)
 - Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
 - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
 - Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`)
diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py
@@ -285,7 +285,7 @@ def _from_sequence_of_strings(
     ) -> T:
         from pandas.core.tools.numeric import to_numeric
 
-        scalars = to_numeric(strings, errors="raise")
+        scalars = to_numeric(strings, errors="raise", use_nullable_dtypes=True)
         return cls._from_sequence(scalars, dtype=dtype, copy=copy)
 
     _HANDLED_TYPES = (np.ndarray, numbers.Number)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -18,6 +18,7 @@
 import pandas._testing as tm
 from pandas.core.arrays import (
     ArrowStringArray,
+    IntegerArray,
     StringArray,
 )
 
@@ -527,3 +528,23 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
             }
         )
     tm.assert_frame_equal(result, expected)
+
+
+def test_ea_int_avoid_overflow(all_parsers):
+    # GH#32134
+    parser = all_parsers
+    data = """a,b
+1,1
+,1
+1582218195625938945,1
+"""
+    result = parser.read_csv(StringIO(data), dtype={"a": "Int64"})
+    expected = DataFrame(
+        {
+            "a": IntegerArray(
+                np.array([1, 1, 1582218195625938945]), np.array([False, True, False])
+            ),
+            "b": 1,
+        }
+    )
+    tm.assert_frame_equal(result, expected)