
BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046


Merged · 13 commits · Apr 28, 2021
Changes from 4 commits
12 changes: 11 additions & 1 deletion pandas/core/arrays/_arrow_utils.py
@@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
     Convert a primitive pyarrow.Array to a numpy array and boolean mask based
     on the buffers of the Array.

+    At the moment pyarrow.BooleanArray is not supported.
+
     Parameters
     ----------
     arr : pyarrow.Array
@@ -25,8 +27,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
     Tuple of two numpy arrays with the raw data (with specified dtype) and
     a boolean mask (validity mask, so False means missing)
     """
+    dtype = np.dtype(dtype)
+
     buflist = arr.buffers()
-    data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
+    # Since Arrow buffers might contain padding and the data might be offset,
+    # the buffer gets sliced here before handing it to numpy.
+    # See also https://github.com/pandas-dev/pandas/issues/40896
+    offset = arr.offset * dtype.itemsize
+    length = len(arr) * dtype.itemsize
+    data_buf = buflist[1][offset : offset + length]
+    data = np.frombuffer(data_buf, dtype=dtype)
     bitmask = buflist[0]
     if bitmask is not None:
         mask = pyarrow.BooleanArray.from_buffers(
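For context, a minimal standalone sketch (not part of the PR, example values assumed) of why the data buffer has to be sliced by byte offset and length before np.frombuffer: a sliced Arrow array shares its parent's buffers, so buffers()[1] still holds every value of the parent, and only arr.offset and len(arr) say which part is logically visible.

import numpy as np
import pyarrow as pa

# Zero-copy slice: logical values are [20, 30], but the data buffer still
# contains all four parent values plus any padding pyarrow allocated.
arr = pa.array([10, 20, 30, 40], type=pa.int64()).slice(1, 2)

dtype = np.dtype("int64")
offset = arr.offset * dtype.itemsize            # 1 element * 8 bytes
length = len(arr) * dtype.itemsize              # 2 elements * 8 bytes
data_buf = arr.buffers()[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)     # array([20, 30])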
100 changes: 100 additions & 0 deletions pandas/tests/arrays/masked/test_arrow_compat.py
@@ -1,3 +1,4 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td
@@ -64,3 +65,102 @@ def test_arrow_sliced():
    result = table.slice(2, None).to_pandas()
    expected = df.iloc[2:].reset_index(drop=True)
    tm.assert_frame_equal(result, expected)


@pytest.fixture
def np_dtype_to_arrays(request):
    import pyarrow as pa
Contributor (review comment): can we just add an import_optional_dependency at the top of this file, rather than having skips on every test.

Contributor Author (reply): I used pyarrow 0.16.0 as the minimal version so test_arrow_array no longer gets run with pyarrow 0.15.0. Hope this is appropriate.
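A minimal sketch of the kind of module-level guard being discussed; the reviewer mentions import_optional_dependency, and pytest.importorskip with a 0.16.0 floor is shown here only as one possible illustration, not necessarily what the PR finally adopted.

import pytest

# Skip the entire test module when pyarrow is missing or older than the
# minimum version the author mentions above.
pa = pytest.importorskip("pyarrow", minversion="0.16.0")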


    np_dtype = request.param
    pa_type = pa.from_numpy_dtype(np_dtype)

    pa_array = pa.array([0, 1, 2], type=pa_type)
Member (review comment):

Suggested change:
-    pa_array = pa.array([0, 1, 2], type=pa_type)
+    pa_array = pa.array([0, 1, 2, None], type=pa_type)

Maybe good to include a missing value, to ensure there is a bitmask buffer?

Contributor Author (reply): Yes, good point!
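To illustrate the reviewer's point (assuming default pyarrow behaviour): an array built without nulls typically has no validity buffer at all, so buffers()[0] is None, whereas including a None guarantees there is a bitmask buffer to exercise.

import pyarrow as pa

no_nulls = pa.array([0, 1, 2], type=pa.int64())
with_null = pa.array([0, 1, 2, None], type=pa.int64())

print(no_nulls.buffers()[0])    # None -- no validity bitmask allocated
print(with_null.buffers()[0])   # a pyarrow.Buffer holding the validity bitmask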

    np_expected = np.array([0, 1, 2], dtype=np_dtype)
    mask_expected = np.array([True, True, True])
    return np_dtype, pa_array, np_expected, mask_expected


@td.skip_if_no("pyarrow")
@pytest.mark.parametrize(
"np_dtype_to_arrays",
Contributor (review comment): can you use the fixture: any_real_dtype or any_numpy_dtype instead

    (
        [
            np.int8(),
            np.int16(),
            np.int32(),
            np.int64(),
            np.uint8(),
            np.uint16(),
            np.uint32(),
            np.uint64(),
            np.float32(),
            np.float64(),
        ]
    ),
    indirect=True,
)
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
"""
Test conversion from pyarrow array to numpy array.

Modifies the pyarrow buffer to contain padding and offset, which are
considered valid buffers by pyarrow.

Also tests empty pyarrow arrays with non empty buffers.
See https://github.com/pandas-dev/pandas/issues/40896
"""
import pyarrow as pa

from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
Contributor (review comment): can you import at the top (after the pyarrow check is fine)

Contributor Author (reply): Should I move the rest of the pandas imports below the pyarrow check as well? Looks a bit weird having them separated by the check.


    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    mask_buffer = pa_array.buffers()[0]
    data_buffer = pa_array.buffers()[1]
    data_buffer_bytes = pa_array.buffers()[1].to_pybytes()

    # Add trailing padding to the buffer.
    data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
    pa_array_trail = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer, data_buffer_trail],
        offset=pa_array.offset,
    )
    pa_array_trail.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Add offset to the buffer.
    offset = b"\x00" * (pa_array.type.bit_width // 8)
    data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
    mask_buffer_offset = pa.py_buffer(b"\x0F")
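    # b"\x0F" is 0b00001111: validity bits 0-3 are set, so with the array
    # offset bumped by one the three logical elements map to set bits 1-3.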
    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=len(pa_array),
        buffers=[mask_buffer_offset, data_buffer_offset],
        offset=pa_array.offset + 1,
    )
    pa_array_offset.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected)
    tm.assert_numpy_array_equal(mask, mask_expected)

    # Empty array
    np_expected_empty = np.array([], dtype=np_dtype)
    mask_expected_empty = np.array([], dtype=np.bool_)

    pa_array_offset = pa.Array.from_buffers(
        type=pa_array.type,
        length=0,
        buffers=[mask_buffer, data_buffer],
        offset=pa_array.offset,
    )
    pa_array_offset.validate()
    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
    tm.assert_numpy_array_equal(data, np_expected_empty)
    tm.assert_numpy_array_equal(mask, mask_expected_empty)