Skip to content

BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 28, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion pandas/core/arrays/_arrow_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.

At the moment pyarrow.BooleanArray is not supported.

Parameters
----------
arr : pyarrow.Array
Expand All @@ -25,8 +27,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)

buflist = arr.buffers()
data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
Expand Down
59 changes: 59 additions & 0 deletions pandas/tests/arrays/test_arrow_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np

import pandas.util._test_decorators as td

import pandas._testing as tm


@td.skip_if_no("pyarrow")
def test_pyarrow_array_to_numpy_and_mask():
    """
    Check the pyarrow -> (numpy data, validity mask) conversion.

    Three inputs are exercised: a plain array, an array whose data buffer
    carries a trailing padding byte, and an array whose buffers are read from
    a non-zero offset. The two rebuilt arrays are validated by pyarrow before
    being converted.
    See also https://github.com/pandas-dev/pandas/issues/40896
    """
    import pyarrow as pa

    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    dtype = np.int32
    pa_array = pa.array([0, 1, 2], type=pa.int32())
    np_expected = np.array([0, 1, 2], dtype=np.int32)
    mask_expected = np.array([True, True, True])

    def check_conversion(arrow_array):
        # Every scenario must yield the same data and an all-valid mask.
        data, mask = pyarrow_array_to_numpy_and_mask(arrow_array, dtype)
        tm.assert_numpy_array_equal(data, np_expected)
        tm.assert_numpy_array_equal(mask, mask_expected)

    # Scenario 1: the array exactly as pyarrow built it.
    check_conversion(pa_array)

    validity_buf, values_buf = pa_array.buffers()[:2]
    raw_values = values_buf.to_pybytes()

    # Scenario 2: one extra byte appended after the real data.
    padded_values = pa.py_buffer(raw_values + b"\x00")
    padded_array = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[validity_buf, padded_values],
        offset=pa_array.offset,
    )
    padded_array.validate()
    check_conversion(padded_array)

    # Scenario 3: one zeroed element prepended, with the array offset bumped
    # by one so that element is skipped.
    leading_element = b"\x00" * (pa_array.type.bit_width // 8)
    shifted_values = pa.py_buffer(leading_element + raw_values)
    shifted_validity = pa.py_buffer(b"\x0F")
    shifted_array = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[shifted_validity, shifted_values],
        offset=pa_array.offset + 1,
    )
    shifted_array.validate()
    check_conversion(shifted_array)