-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
f5cc8a8
2a8042c
fd94972
1586d50
5652869
bd70705
ff85a80
df87e14
c195089
f6555df
03648eb
9aa5df6
b86f9fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,16 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
import pandas.util._test_decorators as td | ||
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
|
||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just use pyimportorskip here. i am not sure we can test anything w/o it. |
||
import pyarrow as pa | ||
except ImportError: | ||
pa = None | ||
|
||
arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] | ||
arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] | ||
arrays += [pd.array([True, False, True, None], dtype="boolean")] | ||
|
@@ -18,7 +24,6 @@ def data(request): | |
@td.skip_if_no("pyarrow", min_version="0.15.0") | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
def test_arrow_array(data): | ||
# protocol added in 0.15.0 | ||
import pyarrow as pa | ||
|
||
arr = pa.array(data) | ||
expected = pa.array( | ||
|
@@ -31,7 +36,6 @@ def test_arrow_array(data): | |
@td.skip_if_no("pyarrow", min_version="0.16.0") | ||
def test_arrow_roundtrip(data): | ||
# roundtrip possible from arrow 0.16.0 | ||
import pyarrow as pa | ||
|
||
df = pd.DataFrame({"a": data}) | ||
table = pa.table(df) | ||
|
@@ -44,7 +48,6 @@ def test_arrow_roundtrip(data): | |
@td.skip_if_no("pyarrow", min_version="0.15.1.dev") | ||
def test_arrow_load_from_zero_chunks(data): | ||
# GH-41040 | ||
import pyarrow as pa | ||
|
||
df = pd.DataFrame({"a": data[0:0]}) | ||
table = pa.table(df) | ||
|
@@ -61,7 +64,6 @@ def test_arrow_load_from_zero_chunks(data): | |
def test_arrow_from_arrow_uint(): | ||
# https://github.com/pandas-dev/pandas/issues/31896 | ||
# possible mismatch in types | ||
import pyarrow as pa | ||
|
||
dtype = pd.UInt32Dtype() | ||
result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) | ||
|
@@ -73,7 +75,6 @@ def test_arrow_from_arrow_uint(): | |
@td.skip_if_no("pyarrow", min_version="0.16.0") | ||
def test_arrow_sliced(data): | ||
# https://github.com/pandas-dev/pandas/issues/38525 | ||
import pyarrow as pa | ||
|
||
df = pd.DataFrame({"a": data}) | ||
table = pa.table(df) | ||
|
@@ -89,12 +90,90 @@ def test_arrow_sliced(data): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
@pytest.fixture | ||
def np_dtype_to_arrays(any_real_dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you type this (mainly the output for better readability) also adding a doc-string explaining the purpose There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a fixture in the tests, I don't think we generally ask for annotations there? (eg in conftest.py there is no typing) |
||
np_dtype = np.dtype(any_real_dtype) | ||
pa_type = pa.from_numpy_dtype(np_dtype) | ||
|
||
# None ensures the creation of a bitmask buffer. | ||
pa_array = pa.array([0, 1, 2, None], type=pa_type) | ||
# Since masked Arrow buffer slots are not required to contain a specific | ||
# value, assert only the first three values of the created np.array | ||
np_expected = np.array([0, 1, 2], dtype=np_dtype) | ||
mask_expected = np.array([True, True, True, False]) | ||
return np_dtype, pa_array, np_expected, mask_expected | ||
|
||
|
||
@td.skip_if_no("pyarrow") | ||
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): | ||
""" | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Test conversion from pyarrow array to numpy array. | ||
|
||
Modifies the pyarrow buffer to contain padding and offset, which are | ||
considered valid buffers by pyarrow. | ||
|
||
Also tests empty pyarrow arrays with non empty buffers. | ||
See https://github.com/pandas-dev/pandas/issues/40896 | ||
""" | ||
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you import at the top (after the pyarrow check is fine) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should I move the rest of the pandas imports below the pyarrow check as well? Looks a bit weird having them seperated by the check. |
||
|
||
np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays | ||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype) | ||
tm.assert_numpy_array_equal(data[:3], np_expected) | ||
tm.assert_numpy_array_equal(mask, mask_expected) | ||
|
||
mask_buffer = pa_array.buffers()[0] | ||
data_buffer = pa_array.buffers()[1] | ||
data_buffer_bytes = pa_array.buffers()[1].to_pybytes() | ||
|
||
# Add trailing padding to the buffer. | ||
data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00") | ||
pa_array_trail = pa.Array.from_buffers( | ||
type=pa_array.type, | ||
length=len(pa_array), | ||
buffers=[mask_buffer, data_buffer_trail], | ||
offset=pa_array.offset, | ||
) | ||
pa_array_trail.validate() | ||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype) | ||
tm.assert_numpy_array_equal(data[:3], np_expected) | ||
tm.assert_numpy_array_equal(mask, mask_expected) | ||
|
||
# Add offset to the buffer. | ||
offset = b"\x00" * (pa_array.type.bit_width // 8) | ||
data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes) | ||
mask_buffer_offset = pa.py_buffer(b"\x0E") | ||
pa_array_offset = pa.Array.from_buffers( | ||
type=pa_array.type, | ||
length=len(pa_array), | ||
buffers=[mask_buffer_offset, data_buffer_offset], | ||
offset=pa_array.offset + 1, | ||
) | ||
pa_array_offset.validate() | ||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) | ||
tm.assert_numpy_array_equal(data[:3], np_expected) | ||
tm.assert_numpy_array_equal(mask, mask_expected) | ||
|
||
# Empty array | ||
np_expected_empty = np.array([], dtype=np_dtype) | ||
mask_expected_empty = np.array([], dtype=np.bool_) | ||
|
||
pa_array_offset = pa.Array.from_buffers( | ||
type=pa_array.type, | ||
length=0, | ||
buffers=[mask_buffer, data_buffer], | ||
offset=pa_array.offset, | ||
) | ||
pa_array_offset.validate() | ||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) | ||
tm.assert_numpy_array_equal(data[:3], np_expected_empty) | ||
tm.assert_numpy_array_equal(mask, mask_expected_empty) | ||
|
||
|
||
@td.skip_if_no("pyarrow", min_version="0.16.0") | ||
def test_from_arrow_type_error(request, data): | ||
# ensure that __from_arrow__ returns a TypeError when getting a wrong | ||
# array type | ||
import pyarrow as pa | ||
|
||
if data.dtype != "boolean": | ||
# TODO numeric dtypes cast any incoming array to the correct dtype | ||
# instead of erroring | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you change this to a user facing note.