From aad3b2e16c1e8b71763cbe7514a81611cceead53 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:22:19 -0500 Subject: [PATCH 01/25] avoid floating points for integral floor division --- pandas/core/arrays/arrow/array.py | 20 +++++++++++++++----- pandas/tests/extension/test_arrow.py | 8 ++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b1164301e6d79..e9e06cabbee9c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -127,12 +127,22 @@ def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - # Ensure int // int -> int mirroring Python/Numpy behavior - # as pc.floor(pc.divide_checked(int, int)) -> float - converted_left = cast_for_truediv(left, right) - result = pc.floor(pc.divide(converted_left, right)) - if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide(left, right) + if pa.types.is_integer(divided.type): + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + result = pc.if_else( + pc.less(divided, 0), + pc.if_else(has_remainder, pc.subtract(divided, 1), divided), + divided, + ) + # Ensure compatibility with older versions of pandas where + # int8 // int64 returned int8 rather than int64. result = result.cast(left.type) + else: + assert pa.types.is_floating(divided.type) or pa.types.is_decimal( + divided.type + ) + result = pc.floor(divided) return result ARROW_ARITHMETIC_FUNCS = { diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ed1b7b199a16f..1a9caa9824448 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3246,6 +3246,14 @@ def test_arrow_floordiv_large_values(): tm.assert_series_equal(result, expected) +def test_arrow_floordiv_large_integral_result(): + # GH XXXXX + a = pd.Series([18014398509481983], dtype="int64[pyarrow]") + expected = pd.Series([18014398509481983], dtype="int64[pyarrow]") + result = a // 1 + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From 599420bdd991cd45ae43260beb90e4d78af255b2 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:24:03 -0500 Subject: [PATCH 02/25] comment --- pandas/core/arrays/arrow/array.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e9e06cabbee9c..91395d87a92eb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -132,6 +132,9 @@ def floordiv_compat( has_remainder = pc.not_equal(pc.multiply(divided, right), left) result = pc.if_else( pc.less(divided, 0), + # Avoid using subtract_checked which has significantly worse perf: + # https://github.com/apache/arrow/issues/37678, especially since + # integer overflow should be impossible here/ pc.if_else(has_remainder, pc.subtract(divided, 1), divided), divided, ) From 2f43c4204a7f9f72ec3285dcd682e1fa6be75802 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:25:57 -0500 Subject: [PATCH 03/25] typo --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 91395d87a92eb..ebb4c2ce6cf87 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -134,7 +134,7 @@ def floordiv_compat( pc.less(divided, 0), # Avoid using subtract_checked which has significantly worse perf: # https://github.com/apache/arrow/issues/37678, especially since - # integer overflow should be impossible here/ + # integer overflow should be impossible here. pc.if_else(has_remainder, pc.subtract(divided, 1), divided), divided, ) From 579d4ca4cc20355d23d44b9d8787689beb09a41e Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:28:35 -0500 Subject: [PATCH 04/25] gh reference --- pandas/core/arrays/arrow/array.py | 1 + pandas/tests/extension/test_arrow.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ebb4c2ce6cf87..265eafa8d760c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -129,6 +129,7 @@ def floordiv_compat( ) -> pa.ChunkedArray: divided = pc.divide(left, right) if pa.types.is_integer(divided.type): + # GH 56676, avoid storing intermediate calculating in floating point type. has_remainder = pc.not_equal(pc.multiply(divided, right), left) result = pc.if_else( pc.less(divided, 0), diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1a9caa9824448..a48c6663a9930 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3247,7 +3247,7 @@ def test_arrow_floordiv_large_values(): def test_arrow_floordiv_large_integral_result(): - # GH XXXXX + # GH 56676 a = pd.Series([18014398509481983], dtype="int64[pyarrow]") expected = pd.Series([18014398509481983], dtype="int64[pyarrow]") result = a // 1 From e8267909830ab4f516730fb101aeec6c7d69e553 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:37:22 -0500 Subject: [PATCH 05/25] improve test --- pandas/tests/extension/test_arrow.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a48c6663a9930..c223f1a3b9478 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3246,10 +3246,14 @@ def test_arrow_floordiv_large_values(): tm.assert_series_equal(result, expected) -def test_arrow_floordiv_large_integral_result(): +@pytest.mark.parametrize( + "value", + [18014398509481983, -9223372036854775808], +) +def test_arrow_floordiv_large_integral_result(value): # GH 56676 - a = pd.Series([18014398509481983], dtype="int64[pyarrow]") - expected = pd.Series([18014398509481983], dtype="int64[pyarrow]") + a = pd.Series([value], dtype="int64[pyarrow]") + expected = pd.Series([value], dtype="int64[pyarrow]") result = a // 1 tm.assert_series_equal(result, expected) From bfcff0be8399dbcc5f6931afdb48d4a2de8a926e Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:43:31 -0500 Subject: [PATCH 06/25] fix comment --- pandas/core/arrays/arrow/array.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 265eafa8d760c..b27460c8115dc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -135,7 +135,9 @@ def floordiv_compat( pc.less(divided, 0), # Avoid using subtract_checked which has significantly worse perf: # https://github.com/apache/arrow/issues/37678, especially since - # integer overflow should be impossible here. + # integer overflow should be impossible here. Using subtract_checked + # would also incorrectly raise for -9223372036854775808 // 1, + # if integer overflow occurs, then has_remainder is false. pc.if_else(has_remainder, pc.subtract(divided, 1), divided), divided, ) From cd7c4c72ccc5eb074b800d9fb5db51498f125076 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:47:21 -0500 Subject: [PATCH 07/25] cleanup --- pandas/tests/extension/test_arrow.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c223f1a3b9478..ea32a1d567a71 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3239,21 +3239,19 @@ def test_arrow_floordiv(): def test_arrow_floordiv_large_values(): - # GH 55561 + # GH 56645 a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") expected = pd.Series([1425801600000], dtype="int64[pyarrow]") result = a // 1_000_000 tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "value", - [18014398509481983, -9223372036854775808], -) -def test_arrow_floordiv_large_integral_result(value): +def test_arrow_floordiv_large_integral_result(): # GH 56676 - a = pd.Series([value], dtype="int64[pyarrow]") - expected = pd.Series([value], dtype="int64[pyarrow]") + a = pd.Series([18014398509481983, -9223372036854775808], dtype="int64[pyarrow]") + expected = pd.Series( + [18014398509481983, -9223372036854775808], dtype="int64[pyarrow]" + ) result = a // 1 tm.assert_series_equal(result, expected) From 692dde4ba392ccbca7e50bcd0fb9f0dd8419d51e Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:48:08 -0500 Subject: [PATCH 08/25] cleanup --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 129f5cedb86c2..ca6ae6fbf7452 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -777,6 +777,7 @@ Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with large integral results (:issue:`56676`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) Conversion From fbae188178b8e41c244a373d562c1a68c30c7b5b Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:48:46 -0500 Subject: [PATCH 09/25] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ca6ae6fbf7452..20e3159c934c6 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -776,8 +776,8 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` causing rounding errors for large integral results (:issue:`56676`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) -- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with large integral results (:issue:`56676`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) Conversion From 0b3f3c88df5fa2ad92280a90a85b65b8e995b0ef Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 28 Dec 2023 23:50:08 -0500 Subject: [PATCH 10/25] remove assert --- pandas/core/arrays/arrow/array.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b27460c8115dc..a3a6b1c180955 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -145,9 +145,6 @@ def floordiv_compat( # int8 // int64 returned int8 rather than int64. result = result.cast(left.type) else: - assert pa.types.is_floating(divided.type) or pa.types.is_decimal( - divided.type - ) result = pc.floor(divided) return result From 2024f4c887d300bd7eaf4631fa974ca4e3a99e65 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 29 Dec 2023 00:01:05 -0500 Subject: [PATCH 11/25] revert --- doc/source/whatsnew/v2.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 20e3159c934c6..129f5cedb86c2 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -776,7 +776,6 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) -- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` causing rounding errors for large integral results (:issue:`56676`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) From 31a8e1981a60bc935a7ff860791c437217bed2ad Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 29 Dec 2023 00:18:39 -0500 Subject: [PATCH 12/25] simplify comment --- pandas/core/arrays/arrow/array.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a3a6b1c180955..b0381d21ec118 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -133,11 +133,10 @@ def floordiv_compat( has_remainder = pc.not_equal(pc.multiply(divided, right), left) result = pc.if_else( pc.less(divided, 0), - # Avoid using subtract_checked which has significantly worse perf: - # https://github.com/apache/arrow/issues/37678, especially since - # integer overflow should be impossible here. Using subtract_checked - # would also incorrectly raise for -9223372036854775808 // 1, - # if integer overflow occurs, then has_remainder is false. + # Avoid using subtract_checked which would incorrectly raise + # for -9223372036854775808 // 1, because if integer overflow + # occurs, then has_remainder should be false, and overflowed + # value is discarded. pc.if_else(has_remainder, pc.subtract(divided, 1), divided), divided, ) From f7095ba3f8ec3248ab736aa998a41028d153973d Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 29 Dec 2023 00:21:33 -0500 Subject: [PATCH 13/25] simplify logic --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b0381d21ec118..daea340314c46 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -132,12 +132,12 @@ def floordiv_compat( # GH 56676, avoid storing intermediate calculating in floating point type. has_remainder = pc.not_equal(pc.multiply(divided, right), left) result = pc.if_else( - pc.less(divided, 0), + pc.and_(pc.less(divided, 0), has_remainder), # Avoid using subtract_checked which would incorrectly raise # for -9223372036854775808 // 1, because if integer overflow # occurs, then has_remainder should be false, and overflowed # value is discarded. - pc.if_else(has_remainder, pc.subtract(divided, 1), divided), + pc.subtract(divided, 1), divided, ) # Ensure compatibility with older versions of pandas where From 80a697692eba74e9abcf334c7a53594ce56b58d8 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 29 Dec 2023 00:31:01 -0500 Subject: [PATCH 14/25] simplify test --- pandas/tests/extension/test_arrow.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ea32a1d567a71..f5138e5f4854a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3249,11 +3249,8 @@ def test_arrow_floordiv_large_values(): def test_arrow_floordiv_large_integral_result(): # GH 56676 a = pd.Series([18014398509481983, -9223372036854775808], dtype="int64[pyarrow]") - expected = pd.Series( - [18014398509481983, -9223372036854775808], dtype="int64[pyarrow]" - ) result = a // 1 - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, a) def test_string_to_datetime_parsing_cast(): From c992e774b46f7d7a43df7a316131408c0f38afea Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 29 Dec 2023 02:11:32 -0500 Subject: [PATCH 15/25] fix uint64 overflow --- pandas/core/arrays/arrow/array.py | 13 ++++++++++--- pandas/tests/extension/test_arrow.py | 8 ++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index daea340314c46..e5af5fce136e3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -129,15 +129,22 @@ def floordiv_compat( ) -> pa.ChunkedArray: divided = pc.divide(left, right) if pa.types.is_integer(divided.type): - # GH 56676, avoid storing intermediate calculating in floating point type. + # GH 56676: avoid storing intermediate calculating in floating point type. has_remainder = pc.not_equal(pc.multiply(divided, right), left) result = pc.if_else( - pc.and_(pc.less(divided, 0), has_remainder), + # Pass a typed arrow scalar rather than stdlib int + # which always inferred as int64, to prevent overflow + # in case of large uint64 values. + pc.and_( + pc.less(divided, pa.scalar(0, type=divided.type)), has_remainder + ), + # GH 55561: floordiv should round towards negative infinity. + # pv.divide for integral types rounds towards 0. # Avoid using subtract_checked which would incorrectly raise # for -9223372036854775808 // 1, because if integer overflow # occurs, then has_remainder should be false, and overflowed # value is discarded. - pc.subtract(divided, 1), + pc.subtract(divided, pa.scalar(1, type=divided.type)), divided, ) # Ensure compatibility with older versions of pandas where diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f5138e5f4854a..40ef021412d7a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3253,6 +3253,14 @@ def test_arrow_floordiv_large_integral_result(): tm.assert_series_equal(result, a) +def test_arrow_floordiv_no_overflow(): + # GH 56676 + a = pd.Series([9223372036854775808], dtype="uint64[pyarrow]") + b = pd.Series([1], dtype="uint64[pyarrow]") + result = a // b + tm.assert_series_equal(result, a) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From e56b1b30689b3ae1be4b3dc6da77e6ead81f21ff Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 1 Jan 2024 00:30:28 -0500 Subject: [PATCH 16/25] bug fix --- pandas/core/arrays/arrow/array.py | 5 ++++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e5af5fce136e3..38e658a2a9315 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -136,7 +136,10 @@ def floordiv_compat( # which always inferred as int64, to prevent overflow # in case of large uint64 values. pc.and_( - pc.less(divided, pa.scalar(0, type=divided.type)), has_remainder + pc.less( + pc.bit_wise_xor(left, right), pa.scalar(0, type=divided.type) + ), + has_remainder, ), # GH 55561: floordiv should round towards negative infinity. # pv.divide for integral types rounds towards 0. diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 40ef021412d7a..768a6f9968e5f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3253,6 +3253,14 @@ def test_arrow_floordiv_large_integral_result(): tm.assert_series_equal(result, a) +def test_arrow_floordiv_larger_divisor(): + # GH 56676 + a = pd.Series([-23], dtype="int64[pyarrow]") + result = a // 24 + expected = pd.Series([-1], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_arrow_floordiv_no_overflow(): # GH 56676 a = pd.Series([9223372036854775808], dtype="uint64[pyarrow]") From 63bf5d816adadde2e11c6ac97355123ac11a2904 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 1 Jan 2024 02:42:14 -0500 Subject: [PATCH 17/25] fix overflow --- pandas/core/arrays/arrow/array.py | 10 ++++++++-- pandas/tests/extension/test_arrow.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 38e658a2a9315..91d23300eb950 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -127,8 +127,10 @@ def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - divided = pc.divide(left, right) - if pa.types.is_integer(divided.type): + if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + # Use divide_checked to ensure cases like -9223372036854775808 // -1 + # don't silently overflow. + divided = pc.divide_checked(left, right) # GH 56676: avoid storing intermediate calculating in floating point type. has_remainder = pc.not_equal(pc.multiply(divided, right), left) result = pc.if_else( @@ -154,6 +156,10 @@ def floordiv_compat( # int8 // int64 returned int8 rather than int64. result = result.cast(left.type) else: + # Use divide instead of divide_checked to match numpy + # floordiv where divide by 0 returns infinity for floating + # point types. + divided = pc.divide(left, right) result = pc.floor(divided) return result diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 768a6f9968e5f..9512a99e4760c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3261,6 +3261,23 @@ def test_arrow_floordiv_larger_divisor(): tm.assert_series_equal(result, expected) +def test_arrow_floordiv_integral_invalid(): + # GH 56676 + a = pd.Series([-9223372036854775808], dtype="int64[pyarrow]") + with pytest.raises(pa.lib.ArrowInvalid, match="overflow"): + a // -1 + with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"): + a // 0 + + +def test_arrow_floordiv_floating_0_divisor(): + # GH 56676 + a = pd.Series([2], dtype="double[pyarrow]") + result = a // 0 + expected = pd.Series([float("inf")], dtype="double[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_arrow_floordiv_no_overflow(): # GH 56676 a = pd.Series([9223372036854775808], dtype="uint64[pyarrow]") From 167507ee15845c099ba4f2c379d0a357c4dcf843 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 1 Jan 2024 02:43:05 -0500 Subject: [PATCH 18/25] fix comment --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 91d23300eb950..5fdfbab450fc3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -144,7 +144,7 @@ def floordiv_compat( has_remainder, ), # GH 55561: floordiv should round towards negative infinity. - # pv.divide for integral types rounds towards 0. + # pc.divide_checked for integral types rounds towards 0. # Avoid using subtract_checked which would incorrectly raise # for -9223372036854775808 // 1, because if integer overflow # occurs, then has_remainder should be false, and overflowed From 263b8a2416f2741e9997e2105fa76bb0e25447da Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 2 Jan 2024 22:55:36 -0500 Subject: [PATCH 19/25] fix truediv for large divsor --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 17 ++++++++++++----- pandas/tests/extension/test_arrow.py | 18 ++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 129f5cedb86c2..75971f4ca109e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -776,6 +776,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5fdfbab450fc3..40447a24ee0c6 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -109,7 +109,7 @@ def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar - ) -> pa.ChunkedArray: + ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( @@ -120,8 +120,15 @@ def cast_for_truediv( # Intentionally not using arrow_array.cast because it could be a scalar # value in reflected case, and safe=False only added to # scalar cast in pyarrow 13. - return pc.cast(arrow_array, pa.float64(), safe=False) - return arrow_array + # In arrow, common type between integral and float64 is float64, + # but integral type is safe casted to float64, to mirror python + # and numpy, we want an unsafe cast, so we cast both operands to + # to float64 before invoking arrow. + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, @@ -170,8 +177,8 @@ def floordiv_compat( "rsub": lambda x, y: pc.subtract_checked(y, x), "mul": pc.multiply_checked, "rmul": lambda x, y: pc.multiply_checked(y, x), - "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), - "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), "floordiv": lambda x, y: floordiv_compat(x, y), "rfloordiv": lambda x, y: floordiv_compat(y, x), "mod": NotImplemented, diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9512a99e4760c..9d75801a0d0d9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3286,6 +3286,24 @@ def test_arrow_floordiv_no_overflow(): tm.assert_series_equal(result, a) +def test_arrow_true_division_large_divisor(): + # GH 56706 + a = pd.Series([0], dtype="int64[pyarrow]") + b = pd.Series([18014398509481983], dtype="int64[pyarrow]") + expected = pd.Series([0], dtype="float64[pyarrow]") + result = a / b + tm.assert_series_equal(result, expected) + + +def test_arrow_floor_division_large_divisor(): + # GH 56706 + a = pd.Series([0], dtype="int64[pyarrow]") + b = pd.Series([18014398509481983], dtype="int64[pyarrow]") + expected = pd.Series([0], dtype="int64[pyarrow]") + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From c77c3f71840c9d9bcb40aa69a6ba444fe2301bfa Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 3 Jan 2024 14:55:59 -0500 Subject: [PATCH 20/25] improve unsigned // unsigned --- pandas/core/arrays/arrow/array.py | 50 +++++++++++++++++-------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 40447a24ee0c6..b6c803bae4eb2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -138,29 +138,35 @@ def floordiv_compat( # Use divide_checked to ensure cases like -9223372036854775808 // -1 # don't silently overflow. divided = pc.divide_checked(left, right) - # GH 56676: avoid storing intermediate calculating in floating point type. - has_remainder = pc.not_equal(pc.multiply(divided, right), left) - result = pc.if_else( - # Pass a typed arrow scalar rather than stdlib int - # which always inferred as int64, to prevent overflow - # in case of large uint64 values. - pc.and_( - pc.less( - pc.bit_wise_xor(left, right), pa.scalar(0, type=divided.type) + # If one operand is a signed integer, we need to ensure division + # rounds towards negative infinity instead of 0. If divided.type + # is a signed integer, that means at least one operand is signed + # otherwise divided.type would be unsigned integer. + if pa.types.is_signed_integer(divided.type): + # GH 56676: avoid storing intermediate calculating in + # floating point type. + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + result = pc.if_else( + pc.and_( + pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ), + has_remainder, ), - has_remainder, - ), - # GH 55561: floordiv should round towards negative infinity. - # pc.divide_checked for integral types rounds towards 0. - # Avoid using subtract_checked which would incorrectly raise - # for -9223372036854775808 // 1, because if integer overflow - # occurs, then has_remainder should be false, and overflowed - # value is discarded. - pc.subtract(divided, pa.scalar(1, type=divided.type)), - divided, - ) - # Ensure compatibility with older versions of pandas where - # int8 // int64 returned int8 rather than int64. + # GH 55561: floordiv should round towards negative infinity. + # pc.divide_checked for integral types rounds towards 0. + # Avoid using subtract_checked which would incorrectly raise + # for -9223372036854775808 // 1, because if integer overflow + # occurs, then has_remainder should be false, and overflowed + # value is discarded. + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + # For 2 unsigned integer operands, integer division is + # same as floor division. + result = divided result = result.cast(left.type) else: # Use divide instead of divide_checked to match numpy From e36fb90a143b75364edd915f4c127349bb291f04 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 3 Jan 2024 16:15:02 -0500 Subject: [PATCH 21/25] improve test cases --- pandas/core/arrays/arrow/array.py | 4 ++- pandas/tests/extension/test_arrow.py | 53 +++++++++++++++++----------- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b6c803bae4eb2..87dd62370c13a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -148,11 +148,11 @@ def floordiv_compat( has_remainder = pc.not_equal(pc.multiply(divided, right), left) result = pc.if_else( pc.and_( + has_remainder, pc.less( pc.bit_wise_xor(left, right), pa.scalar(0, type=divided.type), ), - has_remainder, ), # GH 55561: floordiv should round towards negative infinity. # pc.divide_checked for integral types rounds towards 0. @@ -167,6 +167,8 @@ def floordiv_compat( # For 2 unsigned integer operands, integer division is # same as floor division. result = divided + # Ensure compatibility with older versions of pandas where + # int8 // int64 returned int8 rather than int64. result = result.cast(left.type) else: # Use divide instead of divide_checked to match numpy diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9d75801a0d0d9..7ce2e841a76f8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3246,60 +3246,71 @@ def test_arrow_floordiv_large_values(): tm.assert_series_equal(result, expected) -def test_arrow_floordiv_large_integral_result(): +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floordiv_large_integral_result(dtype): # GH 56676 - a = pd.Series([18014398509481983, -9223372036854775808], dtype="int64[pyarrow]") + a = pd.Series([18014398509481983], dtype=dtype) result = a // 1 tm.assert_series_equal(result, a) -def test_arrow_floordiv_larger_divisor(): +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_larger_divisor(pa_type): # GH 56676 - a = pd.Series([-23], dtype="int64[pyarrow]") + dtype = ArrowDtype(pa_type) + a = pd.Series([-23], dtype=dtype) result = a // 24 - expected = pd.Series([-1], dtype="int64[pyarrow]") + expected = pd.Series([-1], dtype=dtype) tm.assert_series_equal(result, expected) -def test_arrow_floordiv_integral_invalid(): +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_integral_invalid(pa_type): # GH 56676 - a = pd.Series([-9223372036854775808], dtype="int64[pyarrow]") - with pytest.raises(pa.lib.ArrowInvalid, match="overflow"): + min_value = np.iinfo(pa_type.to_pandas_dtype()).min + a = pd.Series([min_value], dtype=ArrowDtype(pa_type)) + with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"): a // -1 with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"): a // 0 -def test_arrow_floordiv_floating_0_divisor(): +@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR) +def test_arrow_floordiv_floating_0_divisor(dtype): # GH 56676 - a = pd.Series([2], dtype="double[pyarrow]") + a = pd.Series([2], dtype=dtype) result = a // 0 - expected = pd.Series([float("inf")], dtype="double[pyarrow]") + expected = pd.Series([float("inf")], dtype=dtype) tm.assert_series_equal(result, expected) -def test_arrow_floordiv_no_overflow(): +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_arrow_integral_floordiv_large_values(pa_type): # GH 56676 - a = pd.Series([9223372036854775808], dtype="uint64[pyarrow]") - b = pd.Series([1], dtype="uint64[pyarrow]") + max_value = np.iinfo(pa_type.to_pandas_dtype()).max + dtype = ArrowDtype(pa_type) + a = pd.Series([max_value], dtype=dtype) + b = pd.Series([1], dtype=dtype) result = a // b tm.assert_series_equal(result, a) -def test_arrow_true_division_large_divisor(): +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_true_division_large_divisor(dtype): # GH 56706 - a = pd.Series([0], dtype="int64[pyarrow]") - b = pd.Series([18014398509481983], dtype="int64[pyarrow]") + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) expected = pd.Series([0], dtype="float64[pyarrow]") result = a / b tm.assert_series_equal(result, expected) -def test_arrow_floor_division_large_divisor(): +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floor_division_large_divisor(dtype): # GH 56706 - a = pd.Series([0], dtype="int64[pyarrow]") - b = pd.Series([18014398509481983], dtype="int64[pyarrow]") - expected = pd.Series([0], dtype="int64[pyarrow]") + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype=dtype) result = a // b tm.assert_series_equal(result, expected) From 47c44747c2ab884a5f5a13de68dc80cdcf8e2324 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 4 Jan 2024 16:32:23 -0500 Subject: [PATCH 22/25] negative operand condition --- pandas/core/arrays/arrow/array.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 87dd62370c13a..e922db4dd2c95 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -149,9 +149,15 @@ def floordiv_compat( result = pc.if_else( pc.and_( has_remainder, - pc.less( - pc.bit_wise_xor(left, right), - pa.scalar(0, type=divided.type), + pa.compute.xor( + pa.compute.less( + left, + pa.scalar(0, type=left.type), + ), + pa.compute.less( + right, + pa.scalar(0, type=right.type), + ), ), ), # GH 55561: floordiv should round towards negative infinity. From ae2afa34c945251792457924e9d78e0c8dec71be Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 4 Jan 2024 16:37:20 -0500 Subject: [PATCH 23/25] Revert "negative operand condition" This reverts commit 47c44747c2ab884a5f5a13de68dc80cdcf8e2324. --- pandas/core/arrays/arrow/array.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e922db4dd2c95..87dd62370c13a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -149,15 +149,9 @@ def floordiv_compat( result = pc.if_else( pc.and_( has_remainder, - pa.compute.xor( - pa.compute.less( - left, - pa.scalar(0, type=left.type), - ), - pa.compute.less( - right, - pa.scalar(0, type=right.type), - ), + pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), ), ), # GH 55561: floordiv should round towards negative infinity. From 9913c428b5899e7d9769f16b0a6e6991f5b7053f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 4 Jan 2024 16:38:01 -0500 Subject: [PATCH 24/25] fix readability --- pandas/core/arrays/arrow/array.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 87dd62370c13a..99bd640d9f546 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -146,13 +146,14 @@ def floordiv_compat( # GH 56676: avoid storing intermediate calculating in # floating point type. has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) result = pc.if_else( pc.and_( has_remainder, - pc.less( - pc.bit_wise_xor(left, right), - pa.scalar(0, type=divided.type), - ), + has_one_negative_operand, ), # GH 55561: floordiv should round towards negative infinity. # pc.divide_checked for integral types rounds towards 0. From 51bd7f39242d016fbcb68f3130b0cab7318c474a Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 4 Jan 2024 20:29:30 -0500 Subject: [PATCH 25/25] cleanup comments --- pandas/core/arrays/arrow/array.py | 34 +++++-------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 99bd640d9f546..3633e3f5d75d8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -115,15 +115,8 @@ def cast_for_truediv( if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): + # GH: 56645. # https://github.com/apache/arrow/issues/35563 - # Arrow does not allow safe casting large integral values to float64. - # Intentionally not using arrow_array.cast because it could be a scalar - # value in reflected case, and safe=False only added to - # scalar cast in pyarrow 13. - # In arrow, common type between integral and float64 is float64, - # but integral type is safe casted to float64, to mirror python - # and numpy, we want an unsafe cast, so we cast both operands to - # to float64 before invoking arrow. return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( pa_object, pa.float64(), safe=False ) @@ -134,17 +127,12 @@ def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: + # TODO: Replace with pyarrow floordiv kernel. + # https://github.com/apache/arrow/issues/39386 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): - # Use divide_checked to ensure cases like -9223372036854775808 // -1 - # don't silently overflow. divided = pc.divide_checked(left, right) - # If one operand is a signed integer, we need to ensure division - # rounds towards negative infinity instead of 0. If divided.type - # is a signed integer, that means at least one operand is signed - # otherwise divided.type would be unsigned integer. if pa.types.is_signed_integer(divided.type): - # GH 56676: avoid storing intermediate calculating in - # floating point type. + # GH 56676 has_remainder = pc.not_equal(pc.multiply(divided, right), left) has_one_negative_operand = pc.less( pc.bit_wise_xor(left, right), @@ -155,26 +143,14 @@ def floordiv_compat( has_remainder, has_one_negative_operand, ), - # GH 55561: floordiv should round towards negative infinity. - # pc.divide_checked for integral types rounds towards 0. - # Avoid using subtract_checked which would incorrectly raise - # for -9223372036854775808 // 1, because if integer overflow - # occurs, then has_remainder should be false, and overflowed - # value is discarded. + # GH: 55561 pc.subtract(divided, pa.scalar(1, type=divided.type)), divided, ) else: - # For 2 unsigned integer operands, integer division is - # same as floor division. result = divided - # Ensure compatibility with older versions of pandas where - # int8 // int64 returned int8 rather than int64. result = result.cast(left.type) else: - # Use divide instead of divide_checked to match numpy - # floordiv where divide by 0 returns infinity for floating - # point types. divided = pc.divide(left, right) result = pc.floor(divided) return result