Skip to content

Commit 800ade1

Browse files
ThomasBlauthQC authored and JulianWgs committed
BUG: Slice Arrow buffer before passing it to numpy (pandas-dev#40896) (pandas-dev#41046)
1 parent 8c0f63d commit 800ade1

File tree

3 files changed

+94
-9
lines changed

3 files changed

+94
-9
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,7 @@ I/O
798798
- Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`)
799799
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
800800
- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)
801+
- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
801802

802803
Period
803804
^^^^^^

pandas/core/arrays/_arrow_utils.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
1414
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
1515
on the buffers of the Array.
1616
17+
At the moment pyarrow.BooleanArray is not supported.
18+
1719
Parameters
1820
----------
1921
arr : pyarrow.Array
@@ -25,8 +27,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
2527
Tuple of two numpy arrays with the raw data (with specified dtype) and
2628
a boolean mask (validity mask, so False means missing)
2729
"""
30+
dtype = np.dtype(dtype)
31+
2832
buflist = arr.buffers()
29-
data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
33+
# Since Arrow buffers might contain padding and the data might be offset,
34+
# the buffer gets sliced here before handing it to numpy.
35+
# See also https://github.com/pandas-dev/pandas/issues/40896
36+
offset = arr.offset * dtype.itemsize
37+
length = len(arr) * dtype.itemsize
38+
data_buf = buflist[1][offset : offset + length]
39+
data = np.frombuffer(data_buf, dtype=dtype)
3040
bitmask = buflist[0]
3141
if bitmask is not None:
3242
mask = pyarrow.BooleanArray.from_buffers(

pandas/tests/arrays/masked/test_arrow_compat.py

+82-8
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
import numpy as np
12
import pytest
23

34
import pandas.util._test_decorators as td
45

56
import pandas as pd
67
import pandas._testing as tm
78

9+
pa = pytest.importorskip("pyarrow", minversion="0.15.0")
10+
11+
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
12+
813
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES]
914
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
1015
arrays += [pd.array([True, False, True, None], dtype="boolean")]
@@ -15,10 +20,8 @@ def data(request):
1520
return request.param
1621

1722

18-
@td.skip_if_no("pyarrow", min_version="0.15.0")
1923
def test_arrow_array(data):
2024
# protocol added in 0.15.0
21-
import pyarrow as pa
2225

2326
arr = pa.array(data)
2427
expected = pa.array(
@@ -31,7 +34,6 @@ def test_arrow_array(data):
3134
@td.skip_if_no("pyarrow", min_version="0.16.0")
3235
def test_arrow_roundtrip(data):
3336
# roundtrip possible from arrow 0.16.0
34-
import pyarrow as pa
3537

3638
df = pd.DataFrame({"a": data})
3739
table = pa.table(df)
@@ -44,7 +46,6 @@ def test_arrow_roundtrip(data):
4446
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
4547
def test_arrow_load_from_zero_chunks(data):
4648
# GH-41040
47-
import pyarrow as pa
4849

4950
df = pd.DataFrame({"a": data[0:0]})
5051
table = pa.table(df)
@@ -61,7 +62,6 @@ def test_arrow_load_from_zero_chunks(data):
6162
def test_arrow_from_arrow_uint():
6263
# https://github.com/pandas-dev/pandas/issues/31896
6364
# possible mismatch in types
64-
import pyarrow as pa
6565

6666
dtype = pd.UInt32Dtype()
6767
result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
@@ -73,7 +73,6 @@ def test_arrow_from_arrow_uint():
7373
@td.skip_if_no("pyarrow", min_version="0.16.0")
7474
def test_arrow_sliced(data):
7575
# https://github.com/pandas-dev/pandas/issues/38525
76-
import pyarrow as pa
7776

7877
df = pd.DataFrame({"a": data})
7978
table = pa.table(df)
@@ -89,12 +88,87 @@ def test_arrow_sliced(data):
8988
tm.assert_frame_equal(result, expected)
9089

9190

91+
@pytest.fixture
def np_dtype_to_arrays(any_real_dtype):
    """
    Build matching pyarrow and numpy test data for a single dtype.

    The dtype comes from the ``any_real_dtype`` fixture and is normalised
    with ``np.dtype``.

    Returns
    -------
    tuple
        ``(np_dtype, pa_array, np_expected, mask_expected)`` where
        ``pa_array`` holds ``[0, 1, 2, None]``, ``np_expected`` holds the
        three valid values and ``mask_expected`` is the validity mask
        (``False`` marks the missing slot).
    """
    np_dtype = np.dtype(any_real_dtype)

    # The trailing None forces pyarrow to create a validity bitmask buffer.
    pa_array = pa.array([0, 1, 2, None], type=pa.from_numpy_dtype(np_dtype))

    # A masked Arrow slot is not required to hold any particular value, so
    # the expected data only covers the three leading (valid) entries.
    np_expected = np.array([0, 1, 2], dtype=np_dtype)
    mask_expected = np.array([True, True, True, False])

    return np_dtype, pa_array, np_expected, mask_expected
103+
104+
105+
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Check the pyarrow -> (numpy data, validity mask) conversion.

    Besides the plain array, the conversion is exercised on arrays rebuilt
    from modified buffers that pyarrow still validates:

    * a data buffer with one trailing padding byte,
    * a data buffer with one leading element and ``offset + 1``,
    * a zero-length array backed by the original, non-empty buffers.

    See https://github.com/pandas-dev/pandas/issues/40896
    """
    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays

    def check(arrow_array, expected_data, expected_mask):
        # Only the first three data slots are compared; the fourth slot is
        # masked and its content is unspecified.
        result_data, result_mask = pyarrow_array_to_numpy_and_mask(
            arrow_array, np_dtype
        )
        tm.assert_numpy_array_equal(result_data[:3], expected_data)
        tm.assert_numpy_array_equal(result_mask, expected_mask)

    # Baseline: the array exactly as pyarrow created it.
    check(pa_array, np_expected, mask_expected)

    buffers = pa_array.buffers()
    validity_buf = buffers[0]
    values_buf = buffers[1]
    raw_values = values_buf.to_pybytes()

    # Trailing padding: one extra byte makes the buffer size no longer a
    # multiple of the dtype's item size.
    padded = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[validity_buf, pa.py_buffer(raw_values + b"\x00")],
        offset=pa_array.offset,
    )
    padded.validate()
    check(padded, np_expected, mask_expected)

    # Leading offset: prepend one zeroed element and shift the array start by
    # one slot. 0x0E == 0b00001110, so after the one-slot offset the three
    # values are flagged valid and the final slot is flagged missing.
    item_nbytes = pa_array.type.bit_width // 8
    shifted = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[
            pa.py_buffer(b"\x0E"),
            pa.py_buffer(b"\x00" * item_nbytes + raw_values),
        ],
        offset=pa_array.offset + 1,
    )
    shifted.validate()
    check(shifted, np_expected, mask_expected)

    # Empty array that still references the original non-empty buffers.
    empty = pa.Array.from_buffers(
        type=pa_array.type,
        length=0,
        buffers=[validity_buf, values_buf],
        offset=pa_array.offset,
    )
    empty.validate()
    check(
        empty,
        np.array([], dtype=np_dtype),
        np.array([], dtype=np.bool_),
    )
166+
167+
92168
@td.skip_if_no("pyarrow", min_version="0.16.0")
93169
def test_from_arrow_type_error(request, data):
94170
# ensure that __from_arrow__ returns a TypeError when getting a wrong
95171
# array type
96-
import pyarrow as pa
97-
98172
if data.dtype != "boolean":
99173
# TODO numeric dtypes cast any incoming array to the correct dtype
100174
# instead of erroring

0 commit comments

Comments
 (0)