Skip to content

BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ I/O
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)
- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would not object to actually showing read_parquet reference here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's needed (the original bug report didn't actually involve a plain reading of a Parquet file (not sure this can happen with that), but rather a dataset scan+filter operation (from any source format)).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was creating the arrow table using only the arrow api. The bug then occurred on the conversion to pandas.

I did not actually try to reproduce the bug using read_parquet, so I'm also not sure if it can happen this way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose it could happen when reading a parquet file using the new dataset API by passing a certain filter. But anyway, I don't think we need to make that explicit, IMO the above error message is clear enough.


Period
^^^^^^
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/arrays/_arrow_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.

At the moment pyarrow.BooleanArray is not supported.

Parameters
----------
arr : pyarrow.Array
Expand All @@ -25,8 +27,16 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype):
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
dtype = np.dtype(dtype)

buflist = arr.buffers()
data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
# See also https://github.com/pandas-dev/pandas/issues/40896
offset = arr.offset * dtype.itemsize
length = len(arr) * dtype.itemsize
data_buf = buflist[1][offset : offset + length]
data = np.frombuffer(data_buf, dtype=dtype)
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
Expand Down
90 changes: 82 additions & 8 deletions pandas/tests/arrays/masked/test_arrow_compat.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

pa = pytest.importorskip("pyarrow", minversion="0.15.0")

from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES]
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
arrays += [pd.array([True, False, True, None], dtype="boolean")]
Expand All @@ -15,10 +20,8 @@ def data(request):
return request.param


@td.skip_if_no("pyarrow", min_version="0.15.0")
def test_arrow_array(data):
# protocol added in 0.15.0
import pyarrow as pa

arr = pa.array(data)
expected = pa.array(
Expand All @@ -31,7 +34,6 @@ def test_arrow_array(data):
@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_arrow_roundtrip(data):
# roundtrip possible from arrow 0.16.0
import pyarrow as pa

df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand All @@ -44,7 +46,6 @@ def test_arrow_roundtrip(data):
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
def test_arrow_load_from_zero_chunks(data):
# GH-41040
import pyarrow as pa

df = pd.DataFrame({"a": data[0:0]})
table = pa.table(df)
Expand All @@ -61,7 +62,6 @@ def test_arrow_load_from_zero_chunks(data):
def test_arrow_from_arrow_uint():
# https://github.com/pandas-dev/pandas/issues/31896
# possible mismatch in types
import pyarrow as pa

dtype = pd.UInt32Dtype()
result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
Expand All @@ -73,7 +73,6 @@ def test_arrow_from_arrow_uint():
@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_arrow_sliced(data):
# https://github.com/pandas-dev/pandas/issues/38525
import pyarrow as pa

df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand All @@ -89,12 +88,87 @@ def test_arrow_sliced(data):
tm.assert_frame_equal(result, expected)


@pytest.fixture
def np_dtype_to_arrays(any_real_dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you type this (mainly the output for better readability) also adding a doc-string explaining the purpose

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a fixture in the tests, I don't think we generally ask for annotations there? (eg in conftest.py there is no typing)

np_dtype = np.dtype(any_real_dtype)
pa_type = pa.from_numpy_dtype(np_dtype)

# None ensures the creation of a bitmask buffer.
pa_array = pa.array([0, 1, 2, None], type=pa_type)
# Since masked Arrow buffer slots are not required to contain a specific
# value, assert only the first three values of the created np.array
np_expected = np.array([0, 1, 2], dtype=np_dtype)
mask_expected = np.array([True, True, True, False])
return np_dtype, pa_array, np_expected, mask_expected


def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Test conversion from pyarrow array to numpy array.

    Exercises buffers that pyarrow considers valid but that are tricky for
    the conversion: trailing padding bytes, a non-zero array offset, and a
    zero-length array backed by non-empty buffers.
    See https://github.com/pandas-dev/pandas/issues/40896
    """
    np_dtype, arrow_arr, expected_values, expected_mask = np_dtype_to_arrays

    def check(arr):
        # The masked (fourth) slot may hold arbitrary data, so only the
        # first three values are compared.
        got_values, got_mask = pyarrow_array_to_numpy_and_mask(arr, np_dtype)
        tm.assert_numpy_array_equal(got_values[:3], expected_values)
        tm.assert_numpy_array_equal(got_mask, expected_mask)

    # Array straight from the fixture.
    check(arrow_arr)

    validity_buf, values_buf = arrow_arr.buffers()
    values_bytes = values_buf.to_pybytes()

    # Data buffer with a trailing padding byte: its size is no longer a
    # multiple of the dtype itemsize, yet it is still a valid Arrow buffer.
    padded = pa.Array.from_buffers(
        type=arrow_arr.type,
        length=len(arrow_arr),
        buffers=[validity_buf, pa.py_buffer(values_bytes + b"\x00")],
        offset=arrow_arr.offset,
    )
    padded.validate()
    check(padded)

    # Data buffer with one leading dummy element, consumed via a non-zero
    # array offset; the validity bitmap 0x0E sets bits 1-3, matching the
    # shifted positions of the three valid values.
    itemsize = arrow_arr.type.bit_width // 8
    shifted = pa.Array.from_buffers(
        type=arrow_arr.type,
        length=len(arrow_arr),
        buffers=[
            pa.py_buffer(b"\x0E"),
            pa.py_buffer(b"\x00" * itemsize + values_bytes),
        ],
        offset=arrow_arr.offset + 1,
    )
    shifted.validate()
    check(shifted)

    # Zero-length array backed by the original (non-empty) buffers.
    empty = pa.Array.from_buffers(
        type=arrow_arr.type,
        length=0,
        buffers=[validity_buf, values_buf],
        offset=arrow_arr.offset,
    )
    empty.validate()
    got_values, got_mask = pyarrow_array_to_numpy_and_mask(empty, np_dtype)
    tm.assert_numpy_array_equal(got_values[:3], np.array([], dtype=np_dtype))
    tm.assert_numpy_array_equal(got_mask, np.array([], dtype=np.bool_))


@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_from_arrow_type_error(request, data):
# ensure that __from_arrow__ returns a TypeError when getting a wrong
# array type
import pyarrow as pa

if data.dtype != "boolean":
# TODO numeric dtypes cast any incoming array to the correct dtype
# instead of erroring
Expand Down