Incorrect reading of CSV containing large integers Issue#52505 (#54679)

kvn4 · web-flow · commit 766e2fc7ceb9 · 2023-08-24T09:25:53.000-07:00
* mend

* mmend

* mmend

* mmend
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -185,7 +185,7 @@ Timezones
 
 Numeric
 ^^^^^^^
--
+- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
 -
 
 Conversion
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -223,5 +223,8 @@ def read(self) -> DataFrame:
         elif using_pyarrow_string_dtype():
             frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
         else:
-            frame = table.to_pandas()
+            if isinstance(self.kwds.get("dtype"), dict):
+                frame = table.to_pandas(types_mapper=self.kwds["dtype"].get)
+            else:
+                frame = table.to_pandas()
         return self._finalize_pandas_output(frame)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -558,3 +558,20 @@ def test_string_inference(all_parsers):
         columns=pd.Index(["a", "b"], dtype=dtype),
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_accurate_parsing_of_large_integers(all_parsers):
+    # GH#52505: large integers must be read exactly (no rounding) by every parser
+    data = """SYMBOL,MOMENT,ID,ID_DEAL
+AAPL,20230301181139587,1925036343869802844,
+AAPL,20230301181139587,2023552585717889863,2023552585717263358
+NVDA,20230301181139587,2023552585717889863,2023552585717263359
+AMC,20230301181139587,2023552585717889863,2023552585717263360
+AMZN,20230301181139587,2023552585717889759,2023552585717263360
+MSFT,20230301181139587,2023552585717889863,2023552585717263361
+NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
+    orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
+    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
+    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
+    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
+    assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2

Original file line number	Diff line number	Diff line change
`@@ -185,7 +185,7 @@ Timezones`
`185`	`185`
`186`	`186`	`Numeric`
`187`	`187`	`^^^^^^^`
`188`		`--`
	`188`	+- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
`189`	`189`	`-`
`190`	`190`
`191`	`191`	`Conversion`