Skip to content

Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv #56677

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Jan 5, 2024
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,7 @@ Timezones
Numeric
^^^^^^^
- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)

Expand Down
53 changes: 44 additions & 9 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@

def cast_for_truediv(
arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
) -> pa.ChunkedArray:
) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
# Ensure int / int -> float mirroring Python/Numpy behavior
# as pc.divide_checked(int, int) -> int
if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
Expand All @@ -120,19 +120,54 @@ def cast_for_truediv(
# Intentionally not using arrow_array.cast because it could be a scalar
# value in reflected case, and safe=False only added to
# scalar cast in pyarrow 13.
return pc.cast(arrow_array, pa.float64(), safe=False)
return arrow_array
# In arrow, common type between integral and float64 is float64,
# but integral type is safe casted to float64, to mirror python
# and numpy, we want an unsafe cast, so we cast both operands to
# to float64 before invoking arrow.
return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast(
pa_object, pa.float64(), safe=False
)

return arrow_array, pa_object

def floordiv_compat(
left: pa.ChunkedArray | pa.Array | pa.Scalar,
right: pa.ChunkedArray | pa.Array | pa.Scalar,
) -> pa.ChunkedArray:
# Ensure int // int -> int mirroring Python/Numpy behavior
# as pc.floor(pc.divide_checked(int, int)) -> float
converted_left = cast_for_truediv(left, right)
result = pc.floor(pc.divide(converted_left, right))
if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
# Use divide_checked to ensure cases like -9223372036854775808 // -1
# don't silently overflow.
divided = pc.divide_checked(left, right)
# GH 56676: avoid storing intermediate calculating in floating point type.
has_remainder = pc.not_equal(pc.multiply(divided, right), left)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO floordiv would be a useful compute kernel in arrow, especially since python and numpy both support it. I opened apache/arrow#39386, if added would make this code much simpler (and probably perform better).

result = pc.if_else(
# Pass a typed arrow scalar rather than stdlib int
# which always inferred as int64, to prevent overflow
# in case of large uint64 values.
pc.and_(
pc.less(
pc.bit_wise_xor(left, right), pa.scalar(0, type=divided.type)
),
has_remainder,
),
# GH 55561: floordiv should round towards negative infinity.
# pc.divide_checked for integral types rounds towards 0.
# Avoid using subtract_checked which would incorrectly raise
# for -9223372036854775808 // 1, because if integer overflow
# occurs, then has_remainder should be false, and overflowed
# value is discarded.
pc.subtract(divided, pa.scalar(1, type=divided.type)),
divided,
)
# Ensure compatibility with older versions of pandas where
# int8 // int64 returned int8 rather than int64.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this something that can be changed? If so, the cast to left.type can be removed, result.type is guaranteed to already be integral.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually don't think this old behavior is desirable, for example:

>>> a = pd.Series([-128], dtype="int8[pyarrow]")
>>> b = pd.Series([-1], dtype="int64[pyarrow]")
>>> a // b

With this cast, this operation fails with overflow error, because 128 can't fit in an int8. In numpy, it looks like this operation promotes to common type of int64:

>>> a = pd.Series([-128], dtype="int8")
>>> b = pd.Series([-1], dtype="int64")
>>> a // b
0    128
dtype: int64
>>>

So I think this cast should be removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like there's comments in the test about ArrowExtensionArray not promoting, so maybe this is the intended behavior? Restored the cast.

result = result.cast(left.type)
else:
# Use divide instead of divide_checked to match numpy
# floordiv where divide by 0 returns infinity for floating
# point types.
divided = pc.divide(left, right)
result = pc.floor(divided)
return result

ARROW_ARITHMETIC_FUNCS = {
Expand All @@ -142,8 +177,8 @@ def floordiv_compat(
"rsub": lambda x, y: pc.subtract_checked(y, x),
"mul": pc.multiply_checked,
"rmul": lambda x, y: pc.multiply_checked(y, x),
"truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
"rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
"truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)),
"rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)),
"floordiv": lambda x, y: floordiv_compat(x, y),
"rfloordiv": lambda x, y: floordiv_compat(y, x),
"mod": NotImplemented,
Expand Down
60 changes: 59 additions & 1 deletion pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3239,13 +3239,71 @@ def test_arrow_floordiv():


def test_arrow_floordiv_large_values():
# GH 55561
# GH 56645
a = pd.Series([1425801600000000000], dtype="int64[pyarrow]")
expected = pd.Series([1425801600000], dtype="int64[pyarrow]")
result = a // 1_000_000
tm.assert_series_equal(result, expected)


def test_arrow_floordiv_large_integral_result():
# GH 56676
a = pd.Series([18014398509481983, -9223372036854775808], dtype="int64[pyarrow]")
result = a // 1
tm.assert_series_equal(result, a)


def test_arrow_floordiv_larger_divisor():
# GH 56676
a = pd.Series([-23], dtype="int64[pyarrow]")
result = a // 24
expected = pd.Series([-1], dtype="int64[pyarrow]")
tm.assert_series_equal(result, expected)


def test_arrow_floordiv_integral_invalid():
# GH 56676
a = pd.Series([-9223372036854775808], dtype="int64[pyarrow]")
with pytest.raises(pa.lib.ArrowInvalid, match="overflow"):
a // -1
with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"):
a // 0


def test_arrow_floordiv_floating_0_divisor():
# GH 56676
a = pd.Series([2], dtype="double[pyarrow]")
result = a // 0
expected = pd.Series([float("inf")], dtype="double[pyarrow]")
tm.assert_series_equal(result, expected)


def test_arrow_floordiv_no_overflow():
# GH 56676
a = pd.Series([9223372036854775808], dtype="uint64[pyarrow]")
b = pd.Series([1], dtype="uint64[pyarrow]")
result = a // b
tm.assert_series_equal(result, a)


def test_arrow_true_division_large_divisor():
# GH 56706
a = pd.Series([0], dtype="int64[pyarrow]")
b = pd.Series([18014398509481983], dtype="int64[pyarrow]")
expected = pd.Series([0], dtype="float64[pyarrow]")
result = a / b
tm.assert_series_equal(result, expected)


def test_arrow_floor_division_large_divisor():
# GH 56706
a = pd.Series([0], dtype="int64[pyarrow]")
b = pd.Series([18014398509481983], dtype="int64[pyarrow]")
expected = pd.Series([0], dtype="int64[pyarrow]")
result = a // b
tm.assert_series_equal(result, expected)


def test_string_to_datetime_parsing_cast():
# GH 56266
string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]
Expand Down