Skip to content

Commit 77b9c29

Browse files
rohanjain101 (Rohan Jain)
authored and committed
Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv (pandas-dev#56677)
* avoid floating points for integral floor division * comment * typo * gh reference * improve test * fix comment * cleanup * cleanup * whatsnew * remove assert * revert * simplify comment * simplify logic * simplify test * fix uint64 overflow * bug fix * fix overflow * fix comment * fix truediv for large divisor * improve unsigned // unsigned * improve test cases * negative operand condition * Revert "negative operand condition" This reverts commit 47c4474. * fix readability * cleanup comments --------- Co-authored-by: Rohan Jain <[email protected]>
1 parent 3c0f1f7 commit 77b9c29

File tree

3 files changed

+104
-14
lines changed

3 files changed

+104
-14
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,7 @@ Timezones
786786
Numeric
787787
^^^^^^^
788788
- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
789+
- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
789790
- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
790791
- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)
791792

pandas/core/arrays/arrow/array.py

+33-13
Original file line numberDiff line numberDiff line change
@@ -110,30 +110,50 @@
110110

111111
def cast_for_truediv(
112112
arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
113-
) -> pa.ChunkedArray:
113+
) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
114114
# Ensure int / int -> float mirroring Python/Numpy behavior
115115
# as pc.divide_checked(int, int) -> int
116116
if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
117117
pa_object.type
118118
):
119+
# GH: 56645.
119120
# https://github.com/apache/arrow/issues/35563
120-
# Arrow does not allow safe casting large integral values to float64.
121-
# Intentionally not using arrow_array.cast because it could be a scalar
122-
# value in reflected case, and safe=False only added to
123-
# scalar cast in pyarrow 13.
124-
return pc.cast(arrow_array, pa.float64(), safe=False)
125-
return arrow_array
121+
return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast(
122+
pa_object, pa.float64(), safe=False
123+
)
124+
125+
return arrow_array, pa_object
126126

127127
def floordiv_compat(
128128
left: pa.ChunkedArray | pa.Array | pa.Scalar,
129129
right: pa.ChunkedArray | pa.Array | pa.Scalar,
130130
) -> pa.ChunkedArray:
131-
# Ensure int // int -> int mirroring Python/Numpy behavior
132-
# as pc.floor(pc.divide_checked(int, int)) -> float
133-
converted_left = cast_for_truediv(left, right)
134-
result = pc.floor(pc.divide(converted_left, right))
131+
# TODO: Replace with pyarrow floordiv kernel.
132+
# https://github.com/apache/arrow/issues/39386
135133
if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
134+
divided = pc.divide_checked(left, right)
135+
if pa.types.is_signed_integer(divided.type):
136+
# GH 56676
137+
has_remainder = pc.not_equal(pc.multiply(divided, right), left)
138+
has_one_negative_operand = pc.less(
139+
pc.bit_wise_xor(left, right),
140+
pa.scalar(0, type=divided.type),
141+
)
142+
result = pc.if_else(
143+
pc.and_(
144+
has_remainder,
145+
has_one_negative_operand,
146+
),
147+
# GH: 55561
148+
pc.subtract(divided, pa.scalar(1, type=divided.type)),
149+
divided,
150+
)
151+
else:
152+
result = divided
136153
result = result.cast(left.type)
154+
else:
155+
divided = pc.divide(left, right)
156+
result = pc.floor(divided)
137157
return result
138158

139159
ARROW_ARITHMETIC_FUNCS = {
@@ -143,8 +163,8 @@ def floordiv_compat(
143163
"rsub": lambda x, y: pc.subtract_checked(y, x),
144164
"mul": pc.multiply_checked,
145165
"rmul": lambda x, y: pc.multiply_checked(y, x),
146-
"truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
147-
"rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
166+
"truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)),
167+
"rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)),
148168
"floordiv": lambda x, y: floordiv_compat(x, y),
149169
"rfloordiv": lambda x, y: floordiv_compat(y, x),
150170
"mod": NotImplemented,

pandas/tests/extension/test_arrow.py

+70-1
Original file line numberDiff line numberDiff line change
@@ -3253,13 +3253,82 @@ def test_arrow_floordiv():
32533253

32543254

32553255
def test_arrow_floordiv_large_values():
3256-
# GH 55561
3256+
# GH 56645
32573257
a = pd.Series([1425801600000000000], dtype="int64[pyarrow]")
32583258
expected = pd.Series([1425801600000], dtype="int64[pyarrow]")
32593259
result = a // 1_000_000
32603260
tm.assert_series_equal(result, expected)
32613261

32623262

3263+
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
3264+
def test_arrow_floordiv_large_integral_result(dtype):
3265+
# GH 56676
3266+
a = pd.Series([18014398509481983], dtype=dtype)
3267+
result = a // 1
3268+
tm.assert_series_equal(result, a)
3269+
3270+
3271+
@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
3272+
def test_arrow_floordiv_larger_divisor(pa_type):
3273+
# GH 56676
3274+
dtype = ArrowDtype(pa_type)
3275+
a = pd.Series([-23], dtype=dtype)
3276+
result = a // 24
3277+
expected = pd.Series([-1], dtype=dtype)
3278+
tm.assert_series_equal(result, expected)
3279+
3280+
3281+
@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
3282+
def test_arrow_floordiv_integral_invalid(pa_type):
3283+
# GH 56676
3284+
min_value = np.iinfo(pa_type.to_pandas_dtype()).min
3285+
a = pd.Series([min_value], dtype=ArrowDtype(pa_type))
3286+
with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"):
3287+
a // -1
3288+
with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"):
3289+
a // 0
3290+
3291+
3292+
@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR)
3293+
def test_arrow_floordiv_floating_0_divisor(dtype):
3294+
# GH 56676
3295+
a = pd.Series([2], dtype=dtype)
3296+
result = a // 0
3297+
expected = pd.Series([float("inf")], dtype=dtype)
3298+
tm.assert_series_equal(result, expected)
3299+
3300+
3301+
@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
3302+
def test_arrow_integral_floordiv_large_values(pa_type):
3303+
# GH 56676
3304+
max_value = np.iinfo(pa_type.to_pandas_dtype()).max
3305+
dtype = ArrowDtype(pa_type)
3306+
a = pd.Series([max_value], dtype=dtype)
3307+
b = pd.Series([1], dtype=dtype)
3308+
result = a // b
3309+
tm.assert_series_equal(result, a)
3310+
3311+
3312+
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
3313+
def test_arrow_true_division_large_divisor(dtype):
3314+
# GH 56706
3315+
a = pd.Series([0], dtype=dtype)
3316+
b = pd.Series([18014398509481983], dtype=dtype)
3317+
expected = pd.Series([0], dtype="float64[pyarrow]")
3318+
result = a / b
3319+
tm.assert_series_equal(result, expected)
3320+
3321+
3322+
@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
3323+
def test_arrow_floor_division_large_divisor(dtype):
3324+
# GH 56706
3325+
a = pd.Series([0], dtype=dtype)
3326+
b = pd.Series([18014398509481983], dtype=dtype)
3327+
expected = pd.Series([0], dtype=dtype)
3328+
result = a // b
3329+
tm.assert_series_equal(result, expected)
3330+
3331+
32633332
def test_string_to_datetime_parsing_cast():
32643333
# GH 56266
32653334
string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]

0 commit comments

Comments
 (0)