Skip to content

Commit 0b4ac6f

Browse files
BUG: Slice Arrow buffer before passing it to numpy (pandas-dev#40896)
Slice the Arrow data buffer before handing it to numpy. This is needed when the buffer contains trailing padding or the array starts at a non-zero offset.
1 parent 4e16e4f commit 0b4ac6f

File tree

2 files changed

+68
-1
lines changed

2 files changed

+68
-1
lines changed

pandas/core/arrays/_arrow_utils.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
1414
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
1515
on the buffers of the Array.
1616
17+
At the moment pyarrow.BooleanArray is not supported.
18+
1719
Parameters
1820
----------
1921
arr : pyarrow.Array
@@ -25,8 +27,15 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
2527
Tuple of two numpy arrays with the raw data (with specified dtype) and
2628
a boolean mask (validity mask, so False means missing)
2729
"""
30+
2831
buflist = arr.buffers()
29-
data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
32+
# Since Arrow buffers might contain padding and the data might be offset,
33+
# the buffer gets sliced here before handing it to numpy.
34+
# See also https://github.com/pandas-dev/pandas/issues/40896
35+
offset = arr.offset * arr.type.bit_width // 8
36+
length = len(arr) * arr.type.bit_width // 8
37+
data_buf = buflist[1][offset : offset + length]
38+
data = np.frombuffer(data_buf, dtype=dtype)
3039
bitmask = buflist[0]
3140
if bitmask is not None:
3241
mask = pyarrow.BooleanArray.from_buffers(
+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import numpy as np
2+
3+
import pandas.util._test_decorators as td
4+
5+
import pandas._testing as tm
6+
7+
8+
@td.skip_if_no("pyarrow")
def test_pyarrow_array_to_numpy_and_mask():
    """
    Test conversion from a pyarrow array to a numpy array and validity mask.

    Besides the plain array, the conversion is checked against arrays whose
    data buffer carries trailing padding or a leading offset; both layouts
    pass ``pyarrow.Array.validate()``.
    See also https://github.com/pandas-dev/pandas/issues/40896
    """
    # Imported lazily so that merely collecting this module does not require
    # pyarrow; the skip decorator handles its absence.
    import pyarrow as pa

    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    # The inputs are plain locals rather than function parameters: pytest
    # treats every test parameter as a fixture request, and no such fixtures
    # are defined alongside this test.
    dtype = np.int32
    pa_array = pa.array([0, 1, 2], type=pa.int32())
    np_expected = np.array([0, 1, 2], dtype=np.int32)
    mask_expected = np.array([True, True, True])

    def check_conversion(arr):
        # Convert ``arr`` and compare both results against the expectations.
        # ``assert_numpy_array_equal`` is used for the mask as well so that a
        # wrong dtype or shape is reported instead of being broadcast away.
        data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype)
        tm.assert_numpy_array_equal(data, np_expected)
        tm.assert_numpy_array_equal(mask, mask_expected)

    # Baseline: the array exactly as pyarrow created it.
    check_conversion(pa_array)

    mask_buffer = pa_array.buffers()[0]
    data_buffer = pa_array.buffers()[1].to_pybytes()

    # Add trailing padding to the buffer.
    data_buffer_trail = pa.py_buffer(data_buffer + b"\x00")
    pa_array_trail = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer, data_buffer_trail],
        offset=pa_array.offset,
    )
    pa_array_trail.validate()
    check_conversion(pa_array_trail)

    # Add offset to the buffer: prepend one element's worth of bytes and
    # start the array one element later.
    offset = b"\x00" * (pa_array.type.bit_width // 8)
    data_buffer_offset = pa.py_buffer(offset + data_buffer)
    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer, data_buffer_offset],
        offset=pa_array.offset + 1,
    )
    pa_array_offset.validate()
    check_conversion(pa_array_offset)

0 commit comments

Comments
 (0)