From a7d88385636e5220b485a808860bb406ceed5ad7 Mon Sep 17 00:00:00 2001 From: Kevin Date: Mon, 21 Aug 2023 16:19:38 -0700 Subject: [PATCH 1/4] mend --- pandas/io/parsers/arrow_parser_wrapper.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 71bfb00a95b50..ffd48b4a0cede 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -223,5 +223,10 @@ def read(self) -> DataFrame: elif using_pyarrow_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: - frame = table.to_pandas() + if self.kwds.get("dtype") is not None and issubclass( + type(self.kwds.get("dtype")), dict + ): + frame = table.to_pandas(types_mapper=self.kwds["dtype"].get) + else: + frame = table.to_pandas() return self._finalize_pandas_output(frame) From 4429c6a9985793b6e02b77c333d7a82726e81082 Mon Sep 17 00:00:00 2001 From: Kevin Date: Tue, 22 Aug 2023 18:33:09 -0700 Subject: [PATCH 2/4] mmend --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/io/parsers/arrow_parser_wrapper.py | 2 +- .../io/parser/dtypes/test_dtypes_basic.py | 21 +++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 6fdffb4d78341..de89a04842c12 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -139,7 +139,7 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :func:`_read`, pyarrow engine not defaulting to float64 causing precision errors when specifying a dtype; fixed by explicitly setting dtype if dtype not none and isinstance of dict (:issue:`52505`) - Conversion diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index ffd48b4a0cede..95294e2e3726f 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -223,7 +223,7 @@ def read(self) -> DataFrame: elif using_pyarrow_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: - if self.kwds.get("dtype") is not None and issubclass( + if self.kwds.get("dtype") is not None and isinstance( type(self.kwds.get("dtype")), dict ): frame = table.to_pandas(types_mapper=self.kwds["dtype"].get) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 1c0f0939029ff..8411129a93bbf 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -558,3 +558,24 @@ def test_string_inference(all_parsers): columns=pd.Index(["a", "b"], dtype=dtype), ) tm.assert_frame_equal(result, expected) + + +def test_accurate_parsing_of_large_integers(all_parsers): + # GH#52505 + data = """SYMBOL,SYSTEM,TYPE,MOMENT,ID,ACTION,PRICE,VOLUME,ID_DEAL,PRICE_DEAL +AAPL,F,S,20230301181139587,1925036343869802844,0,96690.00000,2,,75.00000 +MSFT,F,S,20230301181139587,2023552585717888193,0,75.10000,14,, +TSLA,F,S,20230301181139587,2023552585717889863,1,75.00000,14,, +AAPL,F,S,20230301181139587,2023552585717889863,2,75.00000,1,2023552585717263358,75.00000 +TSLA,F,B,20230301181139587,2023552585717882895,2,75.00000,1,2023552585717263358,75.00000 +NVDA,F,S,20230301181139587,2023552585717889863,2,75.00000,1,2023552585717263359,75.00000 +MRNA,F,B,20230301181139587,2023552585717888161,2,75.00000,1,2023552585717263359,75.00000 +AMC,F,S,20230301181139587,2023552585717889863,2,75.00000,10,2023552585717263360,75.00000 +AMZN,F,B,20230301181139587,2023552585717889759,2,75.00000,10,2023552585717263360,75.00000 +MSFT,F,S,20230301181139587,2023552585717889863,2,75.00000,2,2023552585717263361,75.00000 +NVDA,F,B,20230301181139587,2023552585717889827,2,75.00000,2,2023552585717263361,75.00000""" + orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + print(len(orders.query("ID_DEAL==2023552585717263360", engine="python"))) + tm.assert_equal( + len(orders.query("ID_DEAL==2023552585717263360", engine="python")), 2 + ) From 6f2375181cf9e0c224eead6e6b32d2c79f1f7d6e Mon Sep 17 00:00:00 2001 From: Kevin Date: Wed, 23 Aug 2023 11:16:55 -0700 Subject: [PATCH 3/4] mmend --- doc/source/whatsnew/v2.2.0.rst | 2 +- .../io/parser/dtypes/test_dtypes_basic.py | 28 ++++++++----------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index de89a04842c12..593930745b2ec 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -139,7 +139,7 @@ Timezones Numeric ^^^^^^^ -- Bug in :func:`_read`, pyarrow engine not defaulting to float64 causing precision errors when specifying a dtype; fixed by explicitly setting dtype if dtype not none and isinstance of dict (:issue:`52505`) +- Bug in :func:`_read`, pyarrow engine defaulting to float64 causing rounding errors for large integers; now processes input appropriately (:issue:`52505`) - Conversion diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 8411129a93bbf..f797f6392d56c 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -562,20 +562,16 @@ def test_string_inference(all_parsers): def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 - data = """SYMBOL,SYSTEM,TYPE,MOMENT,ID,ACTION,PRICE,VOLUME,ID_DEAL,PRICE_DEAL -AAPL,F,S,20230301181139587,1925036343869802844,0,96690.00000,2,,75.00000 -MSFT,F,S,20230301181139587,2023552585717888193,0,75.10000,14,, -TSLA,F,S,20230301181139587,2023552585717889863,1,75.00000,14,, -AAPL,F,S,20230301181139587,2023552585717889863,2,75.00000,1,2023552585717263358,75.00000 -TSLA,F,B,20230301181139587,2023552585717882895,2,75.00000,1,2023552585717263358,75.00000 -NVDA,F,S,20230301181139587,2023552585717889863,2,75.00000,1,2023552585717263359,75.00000 -MRNA,F,B,20230301181139587,2023552585717888161,2,75.00000,1,2023552585717263359,75.00000 -AMC,F,S,20230301181139587,2023552585717889863,2,75.00000,10,2023552585717263360,75.00000 -AMZN,F,B,20230301181139587,2023552585717889759,2,75.00000,10,2023552585717263360,75.00000 -MSFT,F,S,20230301181139587,2023552585717889863,2,75.00000,2,2023552585717263361,75.00000 -NVDA,F,B,20230301181139587,2023552585717889827,2,75.00000,2,2023552585717263361,75.00000""" + data = """SYMBOL,MOMENT,ID,ID_DEAL +AAPL,20230301181139587,1925036343869802844, +AAPL,20230301181139587,2023552585717889863,2023552585717263358 +NVDA,20230301181139587,2023552585717889863,2023552585717263359 +AMC,20230301181139587,2023552585717889863,2023552585717263360 +AMZN,20230301181139587,2023552585717889759,2023552585717263360 +MSFT,20230301181139587,2023552585717889863,2023552585717263361 +NVDA,20230301181139587,2023552585717889827,2023552585717263361""" orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) - print(len(orders.query("ID_DEAL==2023552585717263360", engine="python"))) - tm.assert_equal( - len(orders.query("ID_DEAL==2023552585717263360", engine="python")), 2 - ) + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2 From e2d51c43223adb19f7101781183743e374d64b70 Mon Sep 17 00:00:00 2001 From: Kevin Date: Thu, 24 Aug 2023 02:27:21 -0700 Subject: [PATCH 4/4] mmend --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/io/parsers/arrow_parser_wrapper.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 593930745b2ec..138be2457d718 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -139,7 +139,7 @@ Timezones Numeric ^^^^^^^ -- Bug in :func:`_read`, pyarrow engine defaulting to float64 causing rounding errors for large integers; now processes input appropriately (:issue:`52505`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) - Conversion diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 95294e2e3726f..bb6bcd3c4d6a0 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -223,9 +223,7 @@ def read(self) -> DataFrame: elif using_pyarrow_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: - if self.kwds.get("dtype") is not None and isinstance( - type(self.kwds.get("dtype")), dict - ): + if isinstance(self.kwds.get("dtype"), dict): frame = table.to_pandas(types_mapper=self.kwds["dtype"].get) else: frame = table.to_pandas()