BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046
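The gist of the fix: an Arrow buffer may carry trailing padding and a leading offset, both valid per the Arrow spec, but np.frombuffer sizes the resulting array by the whole buffer. Below is a minimal sketch of the idea only (it assumes pyarrow's Buffer.slice API; the exact implementation inside pandas' pyarrow_array_to_numpy_and_mask may differ):

```python
import numpy as np
import pyarrow as pa


def sliced_buffer_to_numpy(arr: pa.Array, dtype: np.dtype) -> np.ndarray:
    # Sketch only: slice the data buffer down to exactly the bytes the
    # array spans (accounting for the array's offset) before handing it
    # to numpy, so trailing padding and leading offset are dropped.
    # (Fixed-width numeric types; booleans are bit-packed and need
    # separate handling.)
    data_buffer = arr.buffers()[1]
    start = arr.offset * dtype.itemsize
    length = len(arr) * dtype.itemsize
    return np.frombuffer(data_buffer.slice(start, length), dtype=dtype)
```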
Changes from all commits
@@ -1,10 +1,15 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

+pa = pytest.importorskip("pyarrow", minversion="0.15.0")
+
+from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
+
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES]
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES]
arrays += [pd.array([True, False, True, None], dtype="boolean")]
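With pyarrow now imported once at module level via pytest.importorskip, the whole module is skipped when pyarrow (or a new enough version of it) is unavailable. That makes the per-test @td.skip_if_no("pyarrow", ...) decorator and the local import pyarrow as pa statements removed in the hunks below redundant.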
@@ -15,10 +20,8 @@ def data(request):
    return request.param


-@td.skip_if_no("pyarrow", min_version="0.15.0")

[jorisvandenbossche marked this conversation as resolved.]

def test_arrow_array(data):
    # protocol added in 0.15.0
-    import pyarrow as pa

    arr = pa.array(data)
    expected = pa.array(
@@ -31,7 +34,6 @@ def test_arrow_array(data):
@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_arrow_roundtrip(data):
    # roundtrip possible from arrow 0.16.0
-    import pyarrow as pa

    df = pd.DataFrame({"a": data})
    table = pa.table(df)
@@ -44,7 +46,6 @@ def test_arrow_roundtrip(data):
@td.skip_if_no("pyarrow", min_version="0.15.1.dev")
def test_arrow_load_from_zero_chunks(data):
    # GH-41040
-    import pyarrow as pa

    df = pd.DataFrame({"a": data[0:0]})
    table = pa.table(df)
@@ -61,7 +62,6 @@ def test_arrow_load_from_zero_chunks(data):
def test_arrow_from_arrow_uint():
    # https://github.com/pandas-dev/pandas/issues/31896
    # possible mismatch in types
-    import pyarrow as pa

    dtype = pd.UInt32Dtype()
    result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
@@ -73,7 +73,6 @@ def test_arrow_from_arrow_uint():
@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_arrow_sliced(data):
    # https://github.com/pandas-dev/pandas/issues/38525
-    import pyarrow as pa

    df = pd.DataFrame({"a": data})
    table = pa.table(df)
@@ -89,12 +88,87 @@ def test_arrow_sliced(data):
    tm.assert_frame_equal(result, expected)


+@pytest.fixture
+def np_dtype_to_arrays(any_real_dtype):

[Review comment] Can you type this (mainly the output, for better readability)? Also add a docstring explaining the purpose.

[Reply] This is a fixture in the tests; I don't think we generally ask for annotations there (e.g. in conftest.py there is no typing).

+    np_dtype = np.dtype(any_real_dtype)
+    pa_type = pa.from_numpy_dtype(np_dtype)
+
+    # None ensures the creation of a bitmask buffer.
+    pa_array = pa.array([0, 1, 2, None], type=pa_type)
+    # Since masked Arrow buffer slots are not required to contain a specific
+    # value, assert only the first three values of the created np.array
+    np_expected = np.array([0, 1, 2], dtype=np_dtype)
+    mask_expected = np.array([True, True, True, False])
+    return np_dtype, pa_array, np_expected, mask_expected
+
+
+def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
+    """
+    Test conversion from pyarrow array to numpy array.
+
+    Modifies the pyarrow buffer to contain padding and offset, which are
+    considered valid buffers by pyarrow.
+
+    Also tests empty pyarrow arrays with non-empty buffers.
+    See https://github.com/pandas-dev/pandas/issues/40896
+    """

[jreback marked this conversation as resolved.]

+    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    mask_buffer = pa_array.buffers()[0]
+    data_buffer = pa_array.buffers()[1]
+    data_buffer_bytes = pa_array.buffers()[1].to_pybytes()
+
+    # Add trailing padding to the buffer.
+    data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00")
+    pa_array_trail = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=len(pa_array),
+        buffers=[mask_buffer, data_buffer_trail],
+        offset=pa_array.offset,
+    )
+    pa_array_trail.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    # Add offset to the buffer.
+    offset = b"\x00" * (pa_array.type.bit_width // 8)
+    data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes)
+    mask_buffer_offset = pa.py_buffer(b"\x0E")
+    pa_array_offset = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=len(pa_array),
+        buffers=[mask_buffer_offset, data_buffer_offset],
+        offset=pa_array.offset + 1,
+    )
+    pa_array_offset.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected)
+    tm.assert_numpy_array_equal(mask, mask_expected)
+
+    # Empty array
+    np_expected_empty = np.array([], dtype=np_dtype)
+    mask_expected_empty = np.array([], dtype=np.bool_)
+
+    pa_array_offset = pa.Array.from_buffers(
+        type=pa_array.type,
+        length=0,
+        buffers=[mask_buffer, data_buffer],
+        offset=pa_array.offset,
+    )
+    pa_array_offset.validate()
+    data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype)
+    tm.assert_numpy_array_equal(data[:3], np_expected_empty)
+    tm.assert_numpy_array_equal(mask, mask_expected_empty)
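Annotation (not part of the diff): in the offset case above, the hand-built validity bitmap b"\x0E" is 0b00001110, and Arrow bitmaps are LSB-first. With offset=1, the four logical elements map to bits 1 through 4, giving validity 1, 1, 1, 0 and hence the expected mask [True, True, True, False].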
@td.skip_if_no("pyarrow", min_version="0.16.0")
def test_from_arrow_type_error(request, data):
    # ensure that __from_arrow__ returns a TypeError when getting a wrong
    # array type
-    import pyarrow as pa

    if data.dtype != "boolean":
        # TODO numeric dtypes cast any incoming array to the correct dtype
        # instead of erroring
[Review comment] Would not object to actually showing a read_parquet reference here.

[Reply] I don't think that's needed. The original bug report didn't actually involve a plain reading of a Parquet file (not sure this can happen with that), but a dataset scan+filter operation (from any source format).

[Reply] Yes, I was creating the Arrow table using only the Arrow API; the bug then occurred on the conversion to pandas. I did not actually try to reproduce the bug using read_parquet, so I'm also not sure if it can happen this way.

[Reply] I suppose it could happen when reading a Parquet file using the new dataset API by passing a certain filter. But anyway, I don't think we need to make that explicit; IMO the above error message is clear enough.
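For context, a minimal sketch of the dataset scan+filter pattern described above, using the pyarrow.dataset API. The file path and the dtype mapping are illustrative, and whether this exact snippet reproduces the original bug on an unpatched pandas is an assumption:

```python
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Write a small Parquet file with a nullable integer column
# ("example.parquet" is an illustrative path).
table = pa.table({"a": pa.array([1, 2, 3, None], type=pa.int64())})
pq.write_table(table, "example.parquet")

# A dataset scan with a filter can yield arrays backed by offset or
# padded buffers; converting them to pandas' nullable Int64 dtype goes
# through pyarrow_array_to_numpy_and_mask, the function fixed here.
filtered = ds.dataset("example.parquet").to_table(filter=ds.field("a") > 1)
df = filtered.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
print(df)
```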