Skip to content

Commit 54dbe45

Browse files
Backport PR #56677 on branch 2.2.x (Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv) (#56744)
Backport PR #56677: Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv

Co-authored-by: rohanjain101 <[email protected]>
1 parent 97eb331 commit 54dbe45

File tree

3 files changed

+104
-14
lines changed

3 files changed

+104
-14
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -786,6 +786,7 @@ Timezones
786786
Numeric
787787
^^^^^^^
788788
- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
789+
- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
789790
- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
790791
- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)
791792

pandas/core/arrays/arrow/array.py

+33-13
Original file line number | Diff line number | Diff line change
@@ -109,30 +109,50 @@
109109

110110
def cast_for_truediv(
111111
arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
112-
) -> pa.ChunkedArray:
112+
) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
113113
# Ensure int / int -> float mirroring Python/Numpy behavior
114114
# as pc.divide_checked(int, int) -> int
115115
if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
116116
pa_object.type
117117
):
118+
# GH: 56645.
118119
# https://github.com/apache/arrow/issues/35563
119-
# Arrow does not allow safe casting large integral values to float64.
120-
# Intentionally not using arrow_array.cast because it could be a scalar
121-
# value in reflected case, and safe=False only added to
122-
# scalar cast in pyarrow 13.
123-
return pc.cast(arrow_array, pa.float64(), safe=False)
124-
return arrow_array
120+
return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast(
121+
pa_object, pa.float64(), safe=False
122+
)
123+
124+
return arrow_array, pa_object
125125

126126
def floordiv_compat(
127127
left: pa.ChunkedArray | pa.Array | pa.Scalar,
128128
right: pa.ChunkedArray | pa.Array | pa.Scalar,
129129
) -> pa.ChunkedArray:
130-
# Ensure int // int -> int mirroring Python/Numpy behavior
131-
# as pc.floor(pc.divide_checked(int, int)) -> float
132-
converted_left = cast_for_truediv(left, right)
133-
result = pc.floor(pc.divide(converted_left, right))
130+
# TODO: Replace with pyarrow floordiv kernel.
131+
# https://github.com/apache/arrow/issues/39386
134132
if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
133+
divided = pc.divide_checked(left, right)
134+
if pa.types.is_signed_integer(divided.type):
135+
# GH 56676
136+
has_remainder = pc.not_equal(pc.multiply(divided, right), left)
137+
has_one_negative_operand = pc.less(
138+
pc.bit_wise_xor(left, right),
139+
pa.scalar(0, type=divided.type),
140+
)
141+
result = pc.if_else(
142+
pc.and_(
143+
has_remainder,
144+
has_one_negative_operand,
145+
),
146+
# GH: 55561
147+
pc.subtract(divided, pa.scalar(1, type=divided.type)),
148+
divided,
149+
)
150+
else:
151+
result = divided
135152
result = result.cast(left.type)
153+
else:
154+
divided = pc.divide(left, right)
155+
result = pc.floor(divided)
136156
return result
137157

138158
ARROW_ARITHMETIC_FUNCS = {
@@ -142,8 +162,8 @@ def floordiv_compat(
142162
"rsub": lambda x, y: pc.subtract_checked(y, x),
143163
"mul": pc.multiply_checked,
144164
"rmul": lambda x, y: pc.multiply_checked(y, x),
145-
"truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
146-
"rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
165+
"truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)),
166+
"rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)),
147167
"floordiv": lambda x, y: floordiv_compat(x, y),
148168
"rfloordiv": lambda x, y: floordiv_compat(y, x),
149169
"mod": NotImplemented,

pandas/tests/extension/test_arrow.py

+70-1
Original file line number | Diff line number | Diff line change
@@ -3260,13 +3260,82 @@ def test_arrow_floordiv():
32603260

32613261

32623262
def test_arrow_floordiv_large_values():
3263-
# GH 55561
3263+
# GH 56645
32643264
a = pd.Series([1425801600000000000], dtype="int64[pyarrow]")
32653265
expected = pd.Series([1425801600000], dtype="int64[pyarrow]")
32663266
result = a // 1_000_000
32673267
tm.assert_series_equal(result, expected)
32683268

32693269

3270+
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
3271+
def test_arrow_floordiv_large_integral_result(dtype):
3272+
# GH 56676
3273+
a = pd.Series([18014398509481983], dtype=dtype)
3274+
result = a // 1
3275+
tm.assert_series_equal(result, a)
3276+
3277+
3278+
@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
3279+
def test_arrow_floordiv_larger_divisor(pa_type):
3280+
# GH 56676
3281+
dtype = ArrowDtype(pa_type)
3282+
a = pd.Series([-23], dtype=dtype)
3283+
result = a // 24
3284+
expected = pd.Series([-1], dtype=dtype)
3285+
tm.assert_series_equal(result, expected)
3286+
3287+
3288+
@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
3289+
def test_arrow_floordiv_integral_invalid(pa_type):
3290+
# GH 56676
3291+
min_value = np.iinfo(pa_type.to_pandas_dtype()).min
3292+
a = pd.Series([min_value], dtype=ArrowDtype(pa_type))
3293+
with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"):
3294+
a // -1
3295+
with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"):
3296+
a // 0
3297+
3298+
3299+
@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR)
3300+
def test_arrow_floordiv_floating_0_divisor(dtype):
3301+
# GH 56676
3302+
a = pd.Series([2], dtype=dtype)
3303+
result = a // 0
3304+
expected = pd.Series([float("inf")], dtype=dtype)
3305+
tm.assert_series_equal(result, expected)
3306+
3307+
3308+
@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
3309+
def test_arrow_integral_floordiv_large_values(pa_type):
3310+
# GH 56676
3311+
max_value = np.iinfo(pa_type.to_pandas_dtype()).max
3312+
dtype = ArrowDtype(pa_type)
3313+
a = pd.Series([max_value], dtype=dtype)
3314+
b = pd.Series([1], dtype=dtype)
3315+
result = a // b
3316+
tm.assert_series_equal(result, a)
3317+
3318+
3319+
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
3320+
def test_arrow_true_division_large_divisor(dtype):
3321+
# GH 56706
3322+
a = pd.Series([0], dtype=dtype)
3323+
b = pd.Series([18014398509481983], dtype=dtype)
3324+
expected = pd.Series([0], dtype="float64[pyarrow]")
3325+
result = a / b
3326+
tm.assert_series_equal(result, expected)
3327+
3328+
3329+
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
3330+
def test_arrow_floor_division_large_divisor(dtype):
3331+
# GH 56706
3332+
a = pd.Series([0], dtype=dtype)
3333+
b = pd.Series([18014398509481983], dtype=dtype)
3334+
expected = pd.Series([0], dtype=dtype)
3335+
result = a // b
3336+
tm.assert_series_equal(result, expected)
3337+
3338+
32703339
def test_string_to_datetime_parsing_cast():
32713340
# GH 56266
32723341
string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]

0 commit comments

Comments
 (0)