From 0ffc5508a67f3def669c7dc767f9ea550f9103f1 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 18:46:24 -0500 Subject: [PATCH 1/8] floordiv fix for large values --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 4 +++- pandas/tests/extension/test_arrow.py | 9 +++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5b955aa45219a..05afac69fe137 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -727,6 +727,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes and ``string[pyarrow]`` raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) Conversion diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 23b5448029dd9..977b832025eb2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -113,7 +113,9 @@ def cast_for_truediv( if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): - return arrow_array.cast(pa.float64()) + # https://github.com/apache/arrow/issues/35563 + # Arrow does not allow safe casting large integral values to float64. + return arrow_array.cast(pa.float64(), safe=False) return arrow_array def floordiv_compat( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3b03272f18203..232fb85b6a3b4 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3133,6 +3133,15 @@ def test_arrow_floordiv(): tm.assert_series_equal(result, expected) +def test_arrow_floordiv_large_values(): + # GH 56645 + a = pd.Series([6028797018963968], dtype="int64[pyarrow]") + b = pd.Series([1], dtype="int64[pyarrow]") + expected = pd.Series([6028797018963968], dtype="int64[pyarrow]") + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From 2697b11ed37da9ac6ac64f1ed60e565bca360e8a Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 18:47:24 -0500 Subject: [PATCH 2/8] fix --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 05afac69fe137..2fcab46c9e229 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -727,7 +727,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) -- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes and ``string[pyarrow]`` raising for large values (:issue:`56645`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) Conversion From bdb734e181444ed0b5501b37fef7c1c81b3a912d Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 18:52:24 -0500 Subject: [PATCH 3/8] exact test --- pandas/tests/extension/test_arrow.py | 53 +++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 232fb85b6a3b4..d5e57243db5d8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3126,10 +3126,55 @@ def test_factorize_chunked_dictionary(): def test_arrow_floordiv(): # GH 55561 - a = pd.Series([-7], dtype="int64[pyarrow]") - b = pd.Series([4], dtype="int64[pyarrow]") - expected = pd.Series([-2], dtype="int64[pyarrow]") - result = a // b + a = pd.Series( + [ + 1425801600000000000, + 1425803400000000000, + 1425805200000000000, + 1425807000000000000, + 1425808800000000000, + 1425801600000000000, + 1425803400000000000, + 1425805200000000000, + 1425807000000000000, + 1446359400000000000, + 1446361200000000000, + 1446363000000000000, + 1446364800000000000, + 1446366600000000000, + 1446364800000000000, + 1446366600000000000, + 1446368400000000000, + 1446370200000000000, + 1446372000000000000, + ], + dtype="int64[pyarrow]", + ) + expected = pd.Series( + [ + 1425801600000, + 1425803400000, + 1425805200000, + 1425807000000, + 1425808800000, + 1425801600000, + 1425803400000, + 1425805200000, + 1425807000000, + 1446359400000, + 1446361200000, + 144636300000, + 144636480000, + 144636660000, + 144636480000, + 144636660000, + 144636840000, + 144637020000, + 1446372000000, + ], + dtype="int64[pyarrow]", + ) + result = a // 1_000_000 tm.assert_series_equal(result, expected) From bcdee56d6509369efcf7738408d8e9464180390b Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 19:02:45 -0500 Subject: [PATCH 4/8] fix --- pandas/tests/extension/test_arrow.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d5e57243db5d8..c4f5c775692ef 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3125,6 +3125,15 @@ def test_factorize_chunked_dictionary(): def test_arrow_floordiv(): + # GH 55561 + a = pd.Series([-7], dtype="int64[pyarrow]") + b = pd.Series([4], dtype="int64[pyarrow]") + expected = pd.Series([-2], dtype="int64[pyarrow]") + result = a // b + tm.assert_series_equal(result, expected) + + +def test_arrow_floordiv_large_values(): # GH 55561 a = pd.Series( [ @@ -3178,15 +3187,6 @@ def test_arrow_floordiv(): tm.assert_series_equal(result, expected) -def test_arrow_floordiv_large_values(): - # GH 56645 - a = pd.Series([6028797018963968], dtype="int64[pyarrow]") - b = pd.Series([1], dtype="int64[pyarrow]") - expected = pd.Series([6028797018963968], dtype="int64[pyarrow]") - result = a // b - tm.assert_series_equal(result, expected) - - def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From c2b3e99a05feae3eebea3c13052e2b848d152be2 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 19:09:06 -0500 Subject: [PATCH 5/8] fix test --- pandas/tests/extension/test_arrow.py | 44 ++-------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c4f5c775692ef..8447c2e40e8d5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3136,51 +3136,11 @@ def test_arrow_floordiv(): def test_arrow_floordiv_large_values(): # GH 55561 a = pd.Series( - [ - 1425801600000000000, - 1425803400000000000, - 1425805200000000000, - 1425807000000000000, - 1425808800000000000, - 1425801600000000000, - 1425803400000000000, - 1425805200000000000, - 1425807000000000000, - 1446359400000000000, - 1446361200000000000, - 1446363000000000000, - 1446364800000000000, - 1446366600000000000, - 1446364800000000000, - 1446366600000000000, - 1446368400000000000, - 1446370200000000000, - 1446372000000000000, - ], + [1425801600000000000], dtype="int64[pyarrow]", ) expected = pd.Series( - [ - 1425801600000, - 1425803400000, - 1425805200000, - 1425807000000, - 1425808800000, - 1425801600000, - 1425803400000, - 1425805200000, - 1425807000000, - 1446359400000, - 1446361200000, - 144636300000, - 144636480000, - 144636660000, - 144636480000, - 144636660000, - 144636840000, - 144637020000, - 1446372000000, - ], + [1425801600000], dtype="int64[pyarrow]", ) result = a // 1_000_000 From 6b557b8fcab80a43791568aa52f3863f303aa3b5 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 19:11:07 -0500 Subject: [PATCH 6/8] test --- pandas/tests/extension/test_arrow.py | 44 ++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8447c2e40e8d5..012ea5132caca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3136,11 +3136,51 @@ def test_arrow_floordiv(): def test_arrow_floordiv_large_values(): # GH 55561 a = pd.Series( - [1425801600000000000], + [ + 1425801600000000000, + 1425803400000000000, + 1425805200000000000, + 1425807000000000000, + 1425808800000000000, + 1425801600000000000, + 1425803400000000000, + 1425805200000000000, + 1425807000000000000, + 1446359400000000000, + 1446361200000000000, + 1446363000000000000, + 1446364800000000000, + 1446366600000000000, + 1446364800000000000, + 1446366600000000000, + 1446368400000000000, + 1446370200000000000, + 1446372000000000000, + ], dtype="int64[pyarrow]", ) expected = pd.Series( - [1425801600000], + [ + 1425801600000, + 1425803400000, + 1425805200000, + 1425807000000, + 1425808800000, + 1425801600000, + 1425803400000, + 1425805200000, + 1425807000000, + 1446359400000, + 1446361200000, + 1446363000000, + 1446364800000, + 1446366600000, + 1446364800000, + 1446366600000, + 1446368400000, + 1446370200000, + 1446372000000, + ], dtype="int64[pyarrow]", ) result = a // 1_000_000 From e6c7930c5a7e2561c60a83a7c50c3b57b51a568e Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 19:45:04 -0500 Subject: [PATCH 7/8] min version fix --- pandas/core/arrays/arrow/array.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 977b832025eb2..5d4af24221086 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -115,7 +115,10 @@ def cast_for_truediv( ): # https://github.com/apache/arrow/issues/35563 # Arrow does not allow safe casting large integral values to float64. - return arrow_array.cast(pa.float64(), safe=False) + # Intentionally not using arrow_array.cast because it could be a scalar + # value in reflected case, and safe=False only added to + # scalar cast in pyarrow 13. + return pc.cast(arrow_array, pa.float64(), safe=False) return arrow_array def floordiv_compat( From 08d213d2cb95777bbdd1da7a541bbb52dcd718a6 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 27 Dec 2023 20:30:22 -0500 Subject: [PATCH 8/8] minimal repro --- pandas/tests/extension/test_arrow.py | 50 ++-------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 012ea5132caca..1ade1d398a4dd 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3135,54 +3135,8 @@ def test_arrow_floordiv(): def test_arrow_floordiv_large_values(): # GH 55561 - a = pd.Series( - [ - 1425801600000000000, - 1425803400000000000, - 1425805200000000000, - 1425807000000000000, - 1425808800000000000, - 1425801600000000000, - 1425803400000000000, - 1425805200000000000, - 1425807000000000000, - 1446359400000000000, - 1446361200000000000, - 1446363000000000000, - 1446364800000000000, - 1446366600000000000, - 1446364800000000000, - 1446366600000000000, - 1446368400000000000, - 1446370200000000000, - 1446372000000000000, - ], - dtype="int64[pyarrow]", - ) - expected = pd.Series( - [ - 1425801600000, - 1425803400000, - 1425805200000, - 1425807000000, - 1425808800000, - 1425801600000, - 1425803400000, - 1425805200000, - 1425807000000, - 1446359400000, - 1446361200000, - 1446363000000, - 1446364800000, - 1446366600000, - 1446364800000, - 1446366600000, - 1446368400000, - 1446370200000, - 1446372000000, - ], - dtype="int64[pyarrow]", - ) + a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") + expected = pd.Series([1425801600000], dtype="int64[pyarrow]") result = a // 1_000_000 tm.assert_series_equal(result, expected)