Skip to content

Commit 1586d50

Browse files
BUG: Slice Arrow buffer before passing it to numpy (pandas-dev#40896)
Move tests to pandas/tests/arrays/masked/test_arrow_compat.py. Add more dtypes to test.
1 parent fd94972 commit 1586d50

File tree

2 files changed

+100
-59
lines changed

2 files changed

+100
-59
lines changed

pandas/tests/arrays/masked/test_arrow_compat.py

+100
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import numpy as np
12
import pytest
23

34
import pandas.util._test_decorators as td
@@ -64,3 +65,102 @@ def test_arrow_sliced():
6465
result = table.slice(2, None).to_pandas()
6566
expected = df.iloc[2:].reset_index(drop=True)
6667
tm.assert_frame_equal(result, expected)
68+
69+
70+
@pytest.fixture
def np_dtype_to_arrays(request):
    """
    Build matching pyarrow and numpy inputs for the parametrized numpy dtype.

    The dtype is taken from ``request.param``. Returns the tuple
    ``(np_dtype, pa_array, np_expected, mask_expected)``: the dtype itself, a
    pyarrow array holding ``[0, 1, 2]`` in the Arrow type derived from that
    dtype, the numpy array with the same values, and an all-True mask of the
    same length.
    """
    import pyarrow as pa

    values = [0, 1, 2]
    np_dtype = request.param

    arrow_array = pa.array(values, type=pa.from_numpy_dtype(np_dtype))
    expected_data = np.array(values, dtype=np_dtype)
    # None of the three values is null, hence the all-True expected mask.
    expected_mask = np.array([True, True, True])
    return np_dtype, arrow_array, expected_data, expected_mask
81+
82+
83+
@td.skip_if_no("pyarrow")
@pytest.mark.parametrize(
    "np_dtype_to_arrays",
    [
        np.int8(),
        np.int16(),
        np.int32(),
        np.int64(),
        np.uint8(),
        np.uint16(),
        np.uint32(),
        np.uint64(),
        np.float32(),
        np.float64(),
    ],
    indirect=True,
)
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Check that pyarrow arrays convert to the expected numpy data and mask.

    Besides the plain array supplied by the fixture, the array is rebuilt
    from hand-made buffers that carry trailing padding or a leading offset
    (each rebuilt array is passed through pyarrow's ``validate()``), and as a
    zero-length array whose buffers are not empty.

    See https://github.com/pandas-dev/pandas/issues/40896
    """
    import pyarrow as pa

    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays

    def assert_converts_to(arrow_array, expected_data, expected_mask):
        # Run the conversion under test and compare both of its outputs.
        data, mask = pyarrow_array_to_numpy_and_mask(arrow_array, np_dtype)
        tm.assert_numpy_array_equal(data, expected_data)
        tm.assert_numpy_array_equal(mask, expected_mask)

    def rebuild(length, buffers, offset):
        # Assemble an array directly from raw buffers and have pyarrow
        # validate the resulting layout before it is used.
        rebuilt = pa.Array.from_buffers(
            type=pa_array.type,
            length=length,
            buffers=buffers,
            offset=offset,
        )
        rebuilt.validate()
        return rebuilt

    # Baseline: the untouched array from the fixture.
    assert_converts_to(pa_array, np_expected, mask_expected)

    original_buffers = pa_array.buffers()
    # NOTE(review): the fixture array holds no nulls, so this first buffer is
    # presumably None (no validity bitmap allocated) -- confirm.
    validity_buffer = original_buffers[0]
    values_buffer = original_buffers[1]
    values_bytes = values_buffer.to_pybytes()

    # Trailing padding: one extra zero byte after the real data.
    padded_values = pa.py_buffer(values_bytes + b"\x00")
    pa_array_trail = rebuild(
        len(pa_array), [validity_buffer, padded_values], pa_array.offset
    )
    assert_converts_to(pa_array_trail, np_expected, mask_expected)

    # Leading offset: one zeroed element in front of the data, skipped by
    # bumping the array offset by one. The bitmap byte 0x0F has its four low
    # bits set, which covers the three slots that follow the offset.
    leading_element = b"\x00" * (pa_array.type.bit_width // 8)
    shifted_values = pa.py_buffer(leading_element + values_bytes)
    shifted_validity = pa.py_buffer(b"\x0F")
    pa_array_offset = rebuild(
        len(pa_array), [shifted_validity, shifted_values], pa_array.offset + 1
    )
    assert_converts_to(pa_array_offset, np_expected, mask_expected)

    # Empty array: zero length on top of the original, non-empty buffers.
    np_expected_empty = np.array([], dtype=np_dtype)
    mask_expected_empty = np.array([], dtype=np.bool_)
    pa_array_empty = rebuild(0, [validity_buffer, values_buffer], pa_array.offset)
    assert_converts_to(pa_array_empty, np_expected_empty, mask_expected_empty)

pandas/tests/arrays/test_arrow_utils.py

-59
This file was deleted.

0 commit comments

Comments
 (0)